tuananhvp25081995 преди 1 година
ревизия
0dd9fabdba
променени са 3 файла, в които са добавени 139 реда и са изтрити 0 реда
  1. 10 0
      go.mod
  2. 35 0
      go.sum
  3. 94 0
      main.go

+ 10 - 0
go.mod

@@ -0,0 +1,10 @@
+module study-test-go/goquery
+
+go 1.19
+
+require github.com/PuerkitoBio/goquery v1.8.1
+
+require (
+	github.com/andybalholm/cascadia v1.3.1 // indirect
+	golang.org/x/net v0.7.0 // indirect
+)

+ 35 - 0
go.sum

@@ -0,0 +1,35 @@
+github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM=
+github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ=
+github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
+github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g=
+golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

+ 94 - 0
main.go

@@ -0,0 +1,94 @@
+package main
+
+import (
+	"fmt"
+	"log"
+	"net/http"
+	"regexp"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+func main() {
+	// Wordpress
+	// url := "https://addand.kr/shop/new-%ed%95%9c-%ea%b6%8c%ec%9c%bc%eb%a1%9c-%eb%81%9d%eb%82%98%eb%8a%94-%eb%85%b8%ec%85%98/"
+
+	url := "https://www.cafe24h.com.vn/ca-phe-truyen-thong/"
+	// Send an HTTP GET request to the URL
+	response, err := http.Get(url)
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer response.Body.Close()
+
+	doc, err := goquery.NewDocumentFromReader(response.Body)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// These will the value of PDP parsing structures
+	cItemName := ".section-header>.section-title"
+	cShortDesc := ".content-detail.col-md-8 p"
+	cItemCategory := ".container>.section-header>.section-title"
+	cItemTags := ".tagged_as"
+	cItemImages := ".product-img a"
+	cItemTextDesc := ".content-detail.col-md-8"
+	// // cItemOptions := "select#rating option"
+	// cItemPrice := ".summary>.price .woocommerce-Price-amount.amount"
+	cEmail := "footer"
+	cRating := ".star-rating .rating"
+	cAuthor := ".product-brand a"
+	cVideo := "iframe"
+
+	// Use the Find method to select elements that match the css selector
+	// doc.Find(cItemName).Each(func(index int, element *goquery.Selection) {
+	// 	// Extract the text associated with the selected element
+	// 	text := element.Text()
+	// 	fmt.Printf("Text associated with %s: %s\n", cItemName, text)
+	// })
+
+	fmt.Println("ItemName: ", doc.Find(cItemName).First().Text())
+	fmt.Println("ShortDesc: ", doc.Find(cShortDesc).First().Text())
+	fmt.Println("ItemCategory: ", doc.Find(cItemCategory).First().Text())
+	fmt.Println("ItemTags: ", doc.Find(cItemTags).First().Text())
+
+	fmt.Println("TextDesc: ", doc.Find(cItemTextDesc).First().Text())
+	// fmt.Println("ItemPice: ", doc.Find(cItemPrice).First().Text())
+	fmt.Println("Email: ", findEmail(doc.Find(cEmail).First().Text(), ""))
+	fmt.Println("Rating : ", doc.Find(cRating).First().Text())
+
+	doc.Find("link ~ meta").Each(func(i int, s *goquery.Selection) {
+		v, _ := s.Attr("property")
+		fmt.Println(v, "  : ", s.AttrOr("content", ""))
+	})
+
+	fmt.Println("Author : ")
+	doc.Find(cAuthor).Each(func(index int, element *goquery.Selection) {
+		link, _ := element.Attr("href")
+		fmt.Println("====================")
+		fmt.Println("AuthorName: ", element.Text())
+		fmt.Printf("AuthorLink: %s\n", link)
+	})
+
+	vdos := []string{}
+	doc.Find(cVideo).Each(func(index int, element *goquery.Selection) {
+		embed, _ := element.Attr("src")
+		vdos = append(vdos, embed)
+	})
+	fmt.Printf("ItemVideos: %s\n", vdos)
+
+	imgs := []string{}
+	doc.Find(cItemImages).Each(func(index int, element *goquery.Selection) {
+		img, _ := element.Attr("href")
+		imgs = append(imgs, img)
+	})
+	fmt.Println("ItemImages: ", imgs)
+
+}
+
// emailPattern matches a dotted local part, an '@', one or more dot-terminated
// domain labels and a 2-24 letter top-level domain. It is compiled once at
// package initialisation; MustCompile panics on an invalid pattern instead of
// leaving a nil *Regexp behind.
var emailPattern = regexp.MustCompile(`[a-zA-Z0-9_+&*-]+(?:\.[a-zA-Z0-9_+&*-]+)*@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,24}`)

// findEmail returns every e-mail address found in body, in order of
// appearance. The result is nil when body contains no address.
//
// doms is not used; it is kept so existing callers continue to compile.
func findEmail(body string, doms string) (emails []string) {
	// FindAllString with n = -1 yields all non-overlapping matches. The pattern
	// has no capturing groups, so FindStringSubmatch could only ever return the
	// first address.
	return emailPattern.FindAllString(body, -1)
}