main.go 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. package main
  2. import (
  3. "fmt"
  4. "log"
  5. "net/http"
  6. "regexp"
  7. "github.com/PuerkitoBio/goquery"
  8. )
  9. func main() {
  10. // Wordpress
  11. // url := "https://addand.kr/shop/new-%ed%95%9c-%ea%b6%8c%ec%9c%bc%eb%a1%9c-%eb%81%9d%eb%82%98%eb%8a%94-%eb%85%b8%ec%85%98/"
  12. url := "https://www.cafe24h.com.vn/ca-phe-truyen-thong/"
  13. url = "https://ssline.kr/shop/view.php?index_no=36357" //cafe24
  14. // url = "https://koskomro.com" //cafe24
  15. // url := "https://bt-beloria-1.myshopify.com/collections/women-collection/products/sweater-classical-tshirt"
  16. // url := "https://seoulknit.com/shop/v-neck-summer-pullover/"
  17. // url = "https://www.elcanto.co.kr" //MakeShop
  18. // url = "https://www.ippngirl.co.kr" //MakeShop
  19. // url = "https://bt-beloria-1.myshopify.com" //shopify
  20. // url = "https://lachinatakorea.com" //Godomall
  21. // url = "https://sf-fd.com" //Godomall
  22. // url = "https://www.vanillagift.com" //Magento
  23. // url = "https://taiwan.coach.com" //Magento
  24. // url = "http://mas1.magikthemes.com" //Magento
  25. // url = "https://aladinmarket.co.kr" //young Cart
  26. // url = "http://damoagift.com" //young Cart
  27. // url = "https://p2u.daboryhost.com" /DaboryShop
  28. // url = "https://seoulknit.com" //Woocommerce
  29. // url = "http://webhost.dabory.com/" /Woocommerce
  30. // url = "https://addand.kr" // WooCommerce
  31. // url := "https://droppii.net.vn/cnd-ginseng-gold"
  32. // Send an HTTP GET request to the URL
  33. response, err := http.Get(url)
  34. if err != nil {
  35. log.Fatal(err)
  36. }
  37. defer response.Body.Close()
  38. doc, err := goquery.NewDocumentFromReader(response.Body)
  39. if err != nil {
  40. log.Fatal(err)
  41. }
  42. // These will the value of PDP parsing structures
  43. cItemName := ".tit"
  44. cShortDesc := ".vi_tab:last"
  45. cItemCategory := ".container>.section-header>.section-title"
  46. // cItemTags := ".vi_tab"
  47. cItemImages := ".vi_info .simg_li img"
  48. cItemTextDesc := ".mart15>.__se_tbl_ext"
  49. // // cItemOptions := "select#rating option"
  50. cItemPrice := ".price_bx .price"
  51. // cEmail := "footer"
  52. // cRating := ".star-rating .rating"
  53. // cAuthor := ".product-brand a"
  54. // cVideo := "iframe"
  55. // Use the Find method to select elements that match the css selector
  56. // doc.Find(cItemName).Each(func(index int, element *goquery.Selection) {
  57. // // Extract the text associated with the selected element
  58. // text := element.Text()
  59. // fmt.Printf("Text associated with %s: %s\n", cItemName, text)
  60. // })
  61. doc.Find("meta").Each(func(i int, s *goquery.Selection) {
  62. v, _ := s.Attr("property")
  63. if v == "author" {
  64. fmt.Println("Author : ", s.AttrOr("content", ""))
  65. }
  66. if v == "og:title" {
  67. fmt.Println("ItemName: ", s.AttrOr("content", ""))
  68. }
  69. if v == "og:description" {
  70. fmt.Println("TextDesc: ", s.AttrOr("content", ""))
  71. }
  72. if v == "og:image" {
  73. fmt.Println("Images: ", s.AttrOr("content", ""))
  74. }
  75. // fmt.Println()
  76. })
  77. fmt.Println("ItemName: ", doc.Find(cItemName).First().Text())
  78. fmt.Println("ShortDesc: ", doc.Find(cShortDesc).First().Text())
  79. fmt.Println("ItemCategory: ", doc.Find(cItemCategory).First().Text())
  80. // fmt.Println("ItemTags: ", doc.Find(cItemTags).First().Text())
  81. fmt.Println("TextDesc: ", doc.Find(cItemTextDesc).First().Text())
  82. fmt.Println("ItemPice: ", doc.Find(cItemPrice).First().Text())
  83. // fmt.Println("Email: ", findEmail(doc.Find(cEmail).First().Text(), ""))
  84. // fmt.Println("Rating : ", doc.Find(cRating).First().Text())
  85. // doc.Find("meta").Each(func(i int, s *goquery.Selection) {
  86. // v, _ := s.Attr("property")
  87. // fmt.Println(v, " : ", s.AttrOr("content", ""))
  88. // })
  89. // doc.Find(cAuthor).Each(func(index int, element *goquery.Selection) {
  90. // link, _ := element.Attr("href")
  91. // fmt.Println("====================")
  92. // fmt.Println("AuthorName: ", element.Text())
  93. // fmt.Printf("AuthorLink: %s\n", link)
  94. // })
  95. // vdos := []string{}
  96. // doc.Find(cVideo).Each(func(index int, element *goquery.Selection) {
  97. // embed, _ := element.Attr("src")
  98. // vdos = append(vdos, embed)
  99. // })
  100. // fmt.Printf("ItemVideos: %s\n", vdos)
  101. imgs := []string{}
  102. doc.Find(cItemImages).Each(func(index int, element *goquery.Selection) {
  103. img, _ := element.Attr("href")
  104. imgs = append(imgs, img)
  105. })
  106. fmt.Println("ItemImages: ", imgs)
  107. }
  108. func findEmail(body string, doms string) (emails []string) {
  109. r, _ := regexp.Compile(`[a-zA-Z0-9_+&*-]+(?:\.[a-zA-Z0-9_+&*-]+)*@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,24}`)
  110. emails = append(emails, r.FindStringSubmatch(body)...)
  111. return
  112. }