main.go 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  1. package main
  2. import (
  3. "fmt"
  4. "log"
  5. "net/http"
  6. "regexp"
  7. "strings"
  8. "github.com/PuerkitoBio/goquery"
  9. )
  10. func main() {
  11. // Wordpress
  12. // url := "https://addand.kr/shop/new-%ed%95%9c-%ea%b6%8c%ec%9c%bc%eb%a1%9c-%eb%81%9d%eb%82%98%eb%8a%94-%eb%85%b8%ec%85%98/"
  13. url := "https://seoulknit.com/shop/collar-half-sleeved-top_blue/"
  14. // Send an HTTP GET request to the URL
  15. response, err := http.Get(url)
  16. if err != nil {
  17. log.Fatal(err)
  18. }
  19. defer response.Body.Close()
  20. doc, err := goquery.NewDocumentFromReader(response.Body)
  21. if err != nil {
  22. log.Fatal(err)
  23. }
  24. // These will the value of PDP parsing structures
  25. cItemName := ".product_title"
  26. cShortDesc := ".woocommerce-product-details__short-description"
  27. cItemCategory := ".posted_in"
  28. cItemTags := ".tagged_as"
  29. cItemImages := ".woocommerce-product-gallery__image img"
  30. cItemTextDesc := "div.woocommerce-tabs"
  31. // cItemOptions := "select#rating option"
  32. cItemPrice := ".summary>.price .woocommerce-Price-amount.amount"
  33. cEmail := "footer"
  34. cRating := ".star-rating .rating"
  35. cAuthor := ".product-brand a"
  36. cVideo := "iframe"
  37. // Use the Find method to select elements that match the css selector
  38. // doc.Find(cItemName).Each(func(index int, element *goquery.Selection) {
  39. // // Extract the text associated with the selected element
  40. // text := element.Text()
  41. // fmt.Printf("Text associated with %s: %s\n", cItemName, text)
  42. // })
  43. fmt.Println("ItemName: ", doc.Find(cItemName).First().Text())
  44. fmt.Println("ShortDesc: ", doc.Find(cShortDesc).First().Text())
  45. fmt.Println("ItemCategory: ", doc.Find(cItemCategory).First().Text())
  46. fmt.Println("ItemTags: ", doc.Find(cItemTags).First().Text())
  47. fmt.Println("TextDesc: ", doc.Find(cItemTextDesc).First().Text())
  48. fmt.Println("ItemPice: ", doc.Find(cItemPrice).First().Text())
  49. fmt.Println("Email: ", findEmail(doc.Find(cEmail).First().Text(), ""))
  50. fmt.Println("Rating : ", doc.Find(cRating).First().Text())
  51. doc.Find("link ~ meta").Each(func(i int, s *goquery.Selection) {
  52. v, _ := s.Attr("property")
  53. fmt.Println(v, " : ", s.AttrOr("content", ""))
  54. })
  55. fmt.Println("Author : ")
  56. doc.Find(cAuthor).Each(func(index int, element *goquery.Selection) {
  57. link, _ := element.Attr("href")
  58. fmt.Println("====================")
  59. fmt.Println("AuthorName: ", element.Text())
  60. fmt.Printf("AuthorLink: %s\n", link)
  61. })
  62. vdos := []string{}
  63. doc.Find(cVideo).Each(func(index int, element *goquery.Selection) {
  64. embed, _ := element.Attr("src")
  65. vdos = append(vdos, embed)
  66. })
  67. fmt.Printf("ItemVideos: %s\n", vdos)
  68. imgs := []string{}
  69. fmt.Println("ItemImages: ")
  70. doc.Find(cItemImages).Each(func(index int, element *goquery.Selection) {
  71. pattern := `(_[0-9]+x[0-9]+|-[0-9]+x[0-9]+)$`
  72. img, _ := element.Attr("src")
  73. imgChop, lefter := ChopPath(img)
  74. im := strings.Split(imgChop, ".")
  75. imgFile := im[0]
  76. imgExt := im[1]
  77. // fmt.Println("imgFile", imgFile)
  78. // fmt.Println("imgExt", imgExt)
  79. // Compile the regular expression
  80. re := regexp.MustCompile(pattern)
  81. // Find the match at the end of the string
  82. matches := re.FindStringSubmatch(imgFile)
  83. if len(matches) > 1 {
  84. substring := matches[1]
  85. imgFile = strings.Replace(imgFile, substring, "", -1)
  86. }
  87. imgs = append(imgs, img[0:lefter+1]+imgFile+"."+imgExt)
  88. fmt.Println("", img[0:lefter+1]+imgFile+"."+imgExt)
  89. fmt.Println("", img)
  90. })
  91. }
  92. func findEmail(body string, doms string) (emails []string) {
  93. r, _ := regexp.Compile(`[a-zA-Z0-9_+&*-]+(?:\.[a-zA-Z0-9_+&*-]+)*@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,24}`)
  94. emails = append(emails, r.FindStringSubmatch(body)...)
  95. return
  96. }
  97. // return the source filename after the last slash
  98. func ChopPath(original string) (string, int) {
  99. i := strings.LastIndex(original, "/")
  100. if i == -1 {
  101. return original, i
  102. } else {
  103. return original[i+1:], i
  104. }
  105. }