main.go 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. package main
  2. import (
  3. "fmt"
  4. "log"
  5. "net/http"
  6. "regexp"
  7. "github.com/PuerkitoBio/goquery"
  8. )
  9. func main() {
  10. // Wordpress
  11. // url := "https://addand.kr/shop/new-%ed%95%9c-%ea%b6%8c%ec%9c%bc%eb%a1%9c-%eb%81%9d%eb%82%98%eb%8a%94-%eb%85%b8%ec%85%98/"
  12. url := "https://www.fredperry.com/men/sharp/glitch-chequerboard-cardigan-k6512-q41.html"
  13. url = "https://taiwan.coach.com/cm538-lhslv.html" //Magento
  14. // url= "https://www.elcanto.co.kr" //MakeShop
  15. // url= "https://www.ippngirl.co.kr" //MakeShop
  16. // url= "https://lachinatakorea.com" //Godomall
  17. // url= "https://sf-fd.com" //Godomall
  18. // url= "https://p2u.daboryhost.com" /DaboryShop
  19. // Send an HTTP GET request to the URL
  20. response, err := http.Get(url)
  21. if err != nil {
  22. log.Fatal(err)
  23. }
  24. defer response.Body.Close()
  25. doc, err := goquery.NewDocumentFromReader(response.Body)
  26. if err != nil {
  27. log.Fatal(err)
  28. }
  29. // These will the value of PDP parsing structures
  30. cItemName := ".page-title-wrapper.product>.page-title"
  31. cShortDesc := ".product.attribute.description"
  32. cItemCategory := ".posted_in"
  33. cItemTags := ".tagged_as"
  34. cItemImages := ".product-item-info a"
  35. cItemTextDesc := ".product-info-descriptions"
  36. // // cItemOptions := "select#rating option"
  37. cItemPrice := ".price-wrapper>.price"
  38. cEmail := "footer"
  39. cRating := ".star-rating .rating"
  40. cAuthor := ".product-brand a"
  41. cVideo := "iframe"
  42. // Use the Find method to select elements that match the css selector
  43. // doc.Find(cItemName).Each(func(index int, element *goquery.Selection) {
  44. // // Extract the text associated with the selected element
  45. // text := element.Text()
  46. // fmt.Printf("Text associated with %s: %s\n", cItemName, text)
  47. // })
  48. doc.Find("meta").Each(func(i int, s *goquery.Selection) {
  49. v, _ := s.Attr("property")
  50. if v == "author" {
  51. fmt.Println("Author : ", s.AttrOr("content", ""))
  52. }
  53. if v == "og:title" {
  54. fmt.Println("ItemName: ", s.AttrOr("content", ""))
  55. }
  56. if v == "og:description" {
  57. fmt.Println("TextDesc: ", s.AttrOr("content", ""))
  58. }
  59. if v == "og:image" {
  60. fmt.Println("Images: ", s.AttrOr("content", ""))
  61. }
  62. // fmt.Println()
  63. })
  64. fmt.Println("ItemName: ", doc.Find(cItemName).First().Text())
  65. fmt.Println("ShortDesc: ", doc.Find(cShortDesc).First().Text())
  66. fmt.Println("ItemCategory: ", doc.Find(cItemCategory).First().Text())
  67. fmt.Println("ItemTags: ", doc.Find(cItemTags).First().Text())
  68. fmt.Println("TextDesc: ", doc.Find(cItemTextDesc).First().Text())
  69. fmt.Println("ItemPice: ", doc.Find(cItemPrice).First().Text())
  70. fmt.Println("Email: ", findEmail(doc.Find(cEmail).First().Text(), ""))
  71. fmt.Println("Rating : ", doc.Find(cRating).First().Text())
  72. fmt.Println("Author : ")
  73. doc.Find(cAuthor).Each(func(index int, element *goquery.Selection) {
  74. link, _ := element.Attr("href")
  75. fmt.Println("====================")
  76. fmt.Println("AuthorName: ", element.Text())
  77. fmt.Printf("AuthorLink: %s\n", link)
  78. })
  79. vdos := []string{}
  80. doc.Find(cVideo).Each(func(index int, element *goquery.Selection) {
  81. embed, _ := element.Attr("src")
  82. vdos = append(vdos, embed)
  83. })
  84. fmt.Printf("ItemVideos: %s\n", vdos)
  85. imgs := []string{}
  86. doc.Find(cItemImages).Each(func(index int, element *goquery.Selection) {
  87. img, _ := element.Attr("href")
  88. imgs = append(imgs, img)
  89. })
  90. fmt.Println("ItemImages: ", imgs)
  91. }
  92. func findEmail(body string, doms string) (emails []string) {
  93. r, _ := regexp.Compile(`[a-zA-Z0-9_+&*-]+(?:\.[a-zA-Z0-9_+&*-]+)*@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,24}`)
  94. emails = append(emails, r.FindStringSubmatch(body)...)
  95. return
  96. }