parser.go 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. package wordpress
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "github.com/PuerkitoBio/goquery"
  6. "kkscrap-go/controllers/scraper/common"
  7. "kkscrap-go/controllers/scraper/util"
  8. "kkscrap-go/model"
  9. "regexp"
  10. "strings"
  11. )
  12. func Parse(uri string, item *model.ItemInfo) {
  13. body, err := util.Get(uri)
  14. util.CheckError(err)
  15. p := getProduct(body)
  16. if p.Image != "" {
  17. item.Images = append(item.Images, p.Image)
  18. }
  19. item.Images = append(item.Images, getImages(body)...)
  20. if p.Offers != nil && len(p.Offers) > 0 {
  21. item.SalesPrice = util.GetFloat32(p.Offers[0].Price)
  22. item.Currency = p.Offers[0].Pricecurrency
  23. }
  24. item.Sku = fmt.Sprintf("%v", p.Sku)
  25. item.ShortDesc = p.Description
  26. item.OriginDesc = getProductDescription(body)
  27. item.TextDesc = getTextDesc(body)
  28. item.ItemName = p.Name
  29. item.Language = common.GetLanguage(body)
  30. item.Emails = common.GetEmails(body)
  31. item.Options = append(item.Options, getOptions(body))
  32. return
  33. }
  34. func getProductDescription(body string) string {
  35. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  36. util.CheckError(err)
  37. s := doc.Find("div.woocommerce-Tabs-panel--description")
  38. //s.Each(func(i int, selection *goquery.Selection) {
  39. // log.Println(selection.Html())
  40. //})
  41. html, err := s.Html()
  42. if err != nil {
  43. return ""
  44. }
  45. return html
  46. }
  47. func getTextDesc(body string) string {
  48. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  49. util.CheckError(err)
  50. text := doc.Find("div.woocommerce-tabs").Text()
  51. return text
  52. }
  53. func getProduct(body string) (ret WordPressProduct) {
  54. jsonStr := getJson(body)
  55. if idx := strings.Index(jsonStr, "@graph"); idx >= 0 {
  56. p := WordPressItem{}
  57. json.Unmarshal([]byte(jsonStr), &p)
  58. ret = p.Graph[1]
  59. } else {
  60. json.Unmarshal([]byte(jsonStr), &ret)
  61. }
  62. return
  63. }
  // getJson returns the text between the first occurrence of
  // `<script type="application/ld+json">` in body and the next "</script>".
  //
  // It returns "" when the opening tag is not present, and also when the
  // opening tag is found but no closing "</script>" follows it.
  func getJson(body string) string {
  	const (
  		openTag  = "<script type=\"application/ld+json\">"
  		closeTag = "</script>"
  	)

  	start := strings.Index(body, openTag)
  	if start < 0 {
  		return ""
  	}
  	rest := body[start+len(openTag):]

  	end := strings.Index(rest, closeTag)
  	if end < 0 {
  		// Unterminated script element: slicing with -1 would panic, and there
  		// is no well-delimited payload to return.
  		return ""
  	}
  	return rest[:end]
  }
  74. func getImages(body string) (ret []string) {
  75. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  76. util.CheckError(err)
  77. //doc.Find("img.wp-post-image").Each(func(i int, s *goquery.Selection) {
  78. // if src, ok := s.Attr("src"); ok {
  79. // width, _ := s.Attr("width")
  80. // height, _ := s.Attr("height")
  81. // w, _ := strconv.ParseInt(width, 10, 64)
  82. // h, _ := strconv.ParseInt(height, 10, 64)
  83. // //log.Println(src, width, height)
  84. // ret = append(ret, model.Image{
  85. // Path: src,
  86. // Width: int(w),
  87. // Height: int(h),
  88. // })
  89. // }
  90. //})
  91. //sort.Slice(ret, func(i, j int) bool {
  92. // return ret[i].Width*ret[i].Height > ret[j].Width*ret[j].Height
  93. //})
  94. //ret = ret[:1]
  95. re := regexp.MustCompile(`-\d+x\d+\.`)
  96. doc.Find("figure.woocommerce-product-gallery__wrapper div").Each(func(i int, s *goquery.Selection) {
  97. if src, ok := s.Attr("data-thumb"); ok {
  98. newImgUrl := re.ReplaceAllString(src, ".")
  99. ret = append(ret, newImgUrl)
  100. }
  101. })
  102. return
  103. }
  104. func getPrice(body string) (ret string) {
  105. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  106. util.CheckError(err)
  107. s := doc.Find("span.woocommerce-Price-amount.amount")
  108. s.Each(func(i int, selection *goquery.Selection) {
  109. if i == 0 {
  110. ret = selection.Text()
  111. }
  112. })
  113. //log.Println(s.Nodes[0].FirstChild.FirstChild.Data)
  114. //log.Println(s.Nodes[0].FirstChild.NextSibling.Data)
  115. return
  116. }
  117. func getCategories(body string) (ret string) {
  118. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  119. util.CheckError(err)
  120. s := doc.Find("span.posted_in")
  121. ret = s.Text()
  122. idx := strings.Index(ret, ":")
  123. if idx > 0 {
  124. ret = strings.TrimSpace(ret[idx+1:])
  125. }
  126. //log.Println(s.Nodes[0].FirstChild.FirstChild.Data)
  127. //log.Println(s.Nodes[0].FirstChild.NextSibling.Data)
  128. return
  129. }
  130. func getOptions(body string) (ret model.Option) {
  131. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  132. util.CheckError(err)
  133. s := doc.Find("body select")
  134. s.Each(func(i int, selection *goquery.Selection) {
  135. skippedFistOption := false
  136. v, _ := selection.Attr("name")
  137. if v != "rating" {
  138. selection.Find("option").Each(func(i int, selection *goquery.Selection) {
  139. if skippedFistOption {
  140. ret.Choices = append(ret.Choices, model.Choice{
  141. Name: selection.Text(),
  142. })
  143. } else {
  144. skippedFistOption = true
  145. }
  146. })
  147. }
  148. })
  149. return
  150. }