parser.go 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. package wordpress
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "kkscrap-go/controllers/scraper/common"
  6. "kkscrap-go/model"
  7. "strings"
  8. util "kkscrap-go/controllers/scraper/util"
  9. "github.com/PuerkitoBio/goquery"
  10. )
  11. func Parse(uri string, item *model.ItemInfo) {
  12. body, err := util.Get(uri)
  13. util.CheckError(err)
  14. p := getProduct(body)
  15. item.Images = append(item.Images, p.Image)
  16. item.Images = append(item.Images, getImages(body)...)
  17. item.SalesPrice = util.GetFloat32(p.Offers[0].Price)
  18. item.Sku = fmt.Sprintf("%v", p.Sku)
  19. item.ShortDesc = p.Description
  20. item.OriginDesc = getDesc(body)
  21. item.TextDesc = getTextDesc(body)
  22. item.ItemName = p.Name
  23. item.Currency = p.Offers[0].Pricecurrency
  24. item.Language = common.GetLanguage(body)
  25. item.Emails = common.GetEmails(body)
  26. item.Options = append(item.Options, getOptions(body))
  27. return
  28. }
  29. func getDesc(body string) string {
  30. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  31. util.CheckError(err)
  32. s := doc.Find("div.woocommerce-tabs")
  33. //s.Each(func(i int, selection *goquery.Selection) {
  34. // log.Println(selection.Html())
  35. //})
  36. html, err := s.Html()
  37. if err != nil {
  38. return ""
  39. }
  40. return html
  41. }
  42. func getTextDesc(body string) string {
  43. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  44. util.CheckError(err)
  45. text := doc.Find("div.woocommerce-tabs").Text()
  46. return text
  47. }
  48. func getProduct(body string) (ret WordPressProduct) {
  49. jsonStr := getJson(body)
  50. if idx := strings.Index(jsonStr, "@graph"); idx >= 0 {
  51. p := WordPressItem{}
  52. json.Unmarshal([]byte(jsonStr), &p)
  53. ret = p.Graph[1]
  54. } else {
  55. json.Unmarshal([]byte(jsonStr), &ret)
  56. }
  57. return
  58. }
  // getJson returns the contents of the first
  // <script type="application/ld+json"> element in body.
  //
  // Only the exact opening tag shown above is recognised. The result is ""
  // when the opening tag is absent or when no closing </script> follows it;
  // the latter case used to panic with a slice-bounds error.
  func getJson(body string) string {
  	const openTag = "<script type=\"application/ld+json\">"
  	const closeTag = "</script>"

  	start := strings.Index(body, openTag)
  	if start < 0 {
  		return ""
  	}
  	payload := body[start+len(openTag):]

  	end := strings.Index(payload, closeTag)
  	if end < 0 {
  		// Truncated or malformed page: there is no complete script element.
  		return ""
  	}
  	return payload[:end]
  }
  69. func getImages(body string) (ret []string) {
  70. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  71. util.CheckError(err)
  72. //doc.Find("img.wp-post-image").Each(func(i int, s *goquery.Selection) {
  73. // if src, ok := s.Attr("src"); ok {
  74. // width, _ := s.Attr("width")
  75. // height, _ := s.Attr("height")
  76. // w, _ := strconv.ParseInt(width, 10, 64)
  77. // h, _ := strconv.ParseInt(height, 10, 64)
  78. // //log.Println(src, width, height)
  79. // ret = append(ret, model.Image{
  80. // Path: src,
  81. // Width: int(w),
  82. // Height: int(h),
  83. // })
  84. // }
  85. //})
  86. //sort.Slice(ret, func(i, j int) bool {
  87. // return ret[i].Width*ret[i].Height > ret[j].Width*ret[j].Height
  88. //})
  89. //ret = ret[:1]
  90. doc.Find("figure.woocommerce-product-gallery__wrapper div").Each(func(i int, s *goquery.Selection) {
  91. if src, ok := s.Attr("data-thumb"); ok {
  92. ret = append(ret, src)
  93. }
  94. })
  95. return
  96. }
  97. func getPrice(body string) (ret string) {
  98. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  99. util.CheckError(err)
  100. s := doc.Find("span.woocommerce-Price-amount.amount")
  101. s.Each(func(i int, selection *goquery.Selection) {
  102. if i == 0 {
  103. ret = selection.Text()
  104. }
  105. })
  106. //log.Println(s.Nodes[0].FirstChild.FirstChild.Data)
  107. //log.Println(s.Nodes[0].FirstChild.NextSibling.Data)
  108. return
  109. }
  110. func getCategories(body string) (ret string) {
  111. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  112. util.CheckError(err)
  113. s := doc.Find("span.posted_in")
  114. ret = s.Text()
  115. idx := strings.Index(ret, ":")
  116. if idx > 0 {
  117. ret = strings.TrimSpace(ret[idx+1:])
  118. }
  119. //log.Println(s.Nodes[0].FirstChild.FirstChild.Data)
  120. //log.Println(s.Nodes[0].FirstChild.NextSibling.Data)
  121. return
  122. }
  123. func getOptions(body string) (ret model.Option) {
  124. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  125. util.CheckError(err)
  126. s := doc.Find("body select")
  127. s.Each(func(i int, selection *goquery.Selection) {
  128. skippedFistOption := false
  129. v, _ := selection.Attr("name")
  130. if v != "rating" {
  131. selection.Find("option").Each(func(i int, selection *goquery.Selection) {
  132. if skippedFistOption {
  133. ret.Choices = append(ret.Choices, model.Choice{
  134. Name: selection.Text(),
  135. })
  136. } else {
  137. skippedFistOption = true
  138. }
  139. })
  140. }
  141. })
  142. return
  143. }