|
@@ -0,0 +1,94 @@
|
|
|
|
+package main
|
|
|
|
+
|
|
|
|
import (
	"fmt"
	"log"
	"net/http"
	"regexp"
	"time"

	"github.com/PuerkitoBio/goquery"
)
|
|
|
|
+
|
|
|
|
+func main() {
|
|
|
|
+ // Wordpress
|
|
|
|
+ // url := "https://addand.kr/shop/new-%ed%95%9c-%ea%b6%8c%ec%9c%bc%eb%a1%9c-%eb%81%9d%eb%82%98%eb%8a%94-%eb%85%b8%ec%85%98/"
|
|
|
|
+
|
|
|
|
+ url := "https://www.cafe24h.com.vn/ca-phe-truyen-thong/"
|
|
|
|
+ // Send an HTTP GET request to the URL
|
|
|
|
+ response, err := http.Get(url)
|
|
|
|
+ if err != nil {
|
|
|
|
+ log.Fatal(err)
|
|
|
|
+ }
|
|
|
|
+ defer response.Body.Close()
|
|
|
|
+
|
|
|
|
+ doc, err := goquery.NewDocumentFromReader(response.Body)
|
|
|
|
+ if err != nil {
|
|
|
|
+ log.Fatal(err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ // These will the value of PDP parsing structures
|
|
|
|
+ cItemName := ".section-header>.section-title"
|
|
|
|
+ cShortDesc := ".content-detail.col-md-8 p"
|
|
|
|
+ cItemCategory := ".container>.section-header>.section-title"
|
|
|
|
+ cItemTags := ".tagged_as"
|
|
|
|
+ cItemImages := ".product-img a"
|
|
|
|
+ cItemTextDesc := ".content-detail.col-md-8"
|
|
|
|
+ // // cItemOptions := "select#rating option"
|
|
|
|
+ // cItemPrice := ".summary>.price .woocommerce-Price-amount.amount"
|
|
|
|
+ cEmail := "footer"
|
|
|
|
+ cRating := ".star-rating .rating"
|
|
|
|
+ cAuthor := ".product-brand a"
|
|
|
|
+ cVideo := "iframe"
|
|
|
|
+
|
|
|
|
+ // Use the Find method to select elements that match the css selector
|
|
|
|
+ // doc.Find(cItemName).Each(func(index int, element *goquery.Selection) {
|
|
|
|
+ // // Extract the text associated with the selected element
|
|
|
|
+ // text := element.Text()
|
|
|
|
+ // fmt.Printf("Text associated with %s: %s\n", cItemName, text)
|
|
|
|
+ // })
|
|
|
|
+
|
|
|
|
+ fmt.Println("ItemName: ", doc.Find(cItemName).First().Text())
|
|
|
|
+ fmt.Println("ShortDesc: ", doc.Find(cShortDesc).First().Text())
|
|
|
|
+ fmt.Println("ItemCategory: ", doc.Find(cItemCategory).First().Text())
|
|
|
|
+ fmt.Println("ItemTags: ", doc.Find(cItemTags).First().Text())
|
|
|
|
+
|
|
|
|
+ fmt.Println("TextDesc: ", doc.Find(cItemTextDesc).First().Text())
|
|
|
|
+ // fmt.Println("ItemPice: ", doc.Find(cItemPrice).First().Text())
|
|
|
|
+ fmt.Println("Email: ", findEmail(doc.Find(cEmail).First().Text(), ""))
|
|
|
|
+ fmt.Println("Rating : ", doc.Find(cRating).First().Text())
|
|
|
|
+
|
|
|
|
+ doc.Find("link ~ meta").Each(func(i int, s *goquery.Selection) {
|
|
|
|
+ v, _ := s.Attr("property")
|
|
|
|
+ fmt.Println(v, " : ", s.AttrOr("content", ""))
|
|
|
|
+ })
|
|
|
|
+
|
|
|
|
+ fmt.Println("Author : ")
|
|
|
|
+ doc.Find(cAuthor).Each(func(index int, element *goquery.Selection) {
|
|
|
|
+ link, _ := element.Attr("href")
|
|
|
|
+ fmt.Println("====================")
|
|
|
|
+ fmt.Println("AuthorName: ", element.Text())
|
|
|
|
+ fmt.Printf("AuthorLink: %s\n", link)
|
|
|
|
+ })
|
|
|
|
+
|
|
|
|
+ vdos := []string{}
|
|
|
|
+ doc.Find(cVideo).Each(func(index int, element *goquery.Selection) {
|
|
|
|
+ embed, _ := element.Attr("src")
|
|
|
|
+ vdos = append(vdos, embed)
|
|
|
|
+ })
|
|
|
|
+ fmt.Printf("ItemVideos: %s\n", vdos)
|
|
|
|
+
|
|
|
|
+ imgs := []string{}
|
|
|
|
+ doc.Find(cItemImages).Each(func(index int, element *goquery.Selection) {
|
|
|
|
+ img, _ := element.Attr("href")
|
|
|
|
+ imgs = append(imgs, img)
|
|
|
|
+ })
|
|
|
|
+ fmt.Println("ItemImages: ", imgs)
|
|
|
|
+
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
// emailPattern matches an e-mail address: a dot-separated local part, an "@",
// one or more dot-terminated domain labels, and a 2-24 letter top-level domain.
// It is compiled once at package scope instead of on every findEmail call.
var emailPattern = regexp.MustCompile(`[a-zA-Z0-9_+&*-]+(?:\.[a-zA-Z0-9_+&*-]+)*@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,24}`)

// findEmail returns every e-mail address found in body, in order of
// appearance. It returns nil when body contains no address.
//
// doms is accepted for interface compatibility but is not used.
func findEmail(body string, doms string) (emails []string) {
	// FindAllString with n = -1 yields all non-overlapping matches, whereas
	// FindStringSubmatch would stop after the first address.
	return emailPattern.FindAllString(body, -1)
}
|