parser.go 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
  1. package parser
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "kkscrap-go/controllers/scraper/common"
  6. util "kkscrap-go/controllers/scraper/util"
  7. wp "kkscrap-go/controllers/scraper/wordpress"
  8. // model "kkscrap-go/models"
  9. modelv2 "kkscrap-go/model"
  10. "regexp"
  11. "strconv"
  12. "strings"
  13. "github.com/PuerkitoBio/goquery"
  14. triphtmltags "github.com/denisbrodbeck/striphtmltags"
  15. )
  16. func Parse(uri string, c modelv2.MatchingConfig) (item modelv2.ItemInfo) {
  17. body, err := util.Get(uri)
  18. if !util.CheckError(err) {
  19. return
  20. }
  21. item.Language = common.GetLanguage(body)
  22. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  23. if !util.CheckError(err) {
  24. return
  25. }
  26. doc.Find("meta").Each(func(i int, s *goquery.Selection) {
  27. v, _ := s.Attr("property")
  28. if v == "og:description" {
  29. item.ShortDesc = s.AttrOr("content", "")
  30. } else if v == "og:title" {
  31. item.ItemName = s.AttrOr("content", "")
  32. } else if v == "product:price:amount" {
  33. item.SalesPrice = util.GetFloat32(s.AttrOr("content", ""))
  34. } else if v == "product:price:currency" {
  35. item.Currency = s.AttrOr("content", "")
  36. }
  37. })
  38. p := getProduct(body)
  39. // item.Images = append(item.Images, p.Image)
  40. if len(p.Offers) > 0 {
  41. item.SalesPrice = util.GetFloat32(p.Offers[0].Price)
  42. }
  43. item.Sku = fmt.Sprintf("%v", p.Sku)
  44. images := getImages(body, c.Images)
  45. item.Emails = common.GetEmails(body)
  46. item.Emails = append(item.Emails, findEmail(body, c.Email)...)
  47. item.Images = append(item.Images, images...)
  48. if c.OriginDesc != "" {
  49. item.OriginDesc = triphtmltags.StripTags(getContext(body, c.OriginDesc))
  50. }
  51. if c.ShortDesc != "" && item.ShortDesc == "" {
  52. item.ShortDesc = triphtmltags.StripTags(getContext(body, c.ShortDesc))
  53. }
  54. if c.TextDesc != "" {
  55. item.TextDesc = triphtmltags.StripTags(getContext(body, c.TextDesc))
  56. }
  57. item.ItemCategory = getCategories(body, c.Category)
  58. item.ItemName = getName(body, c.Name)
  59. price, _ := strconv.ParseFloat(getPrice(body, c.Price), 32)
  60. if item.SalesPrice == 0 && getPrice(body, c.Price) != "" {
  61. item.SalesPrice = float32(price)
  62. }
  63. item.Options = append(item.Options, getOptions(body, c.Options)...)
  64. return
  65. }
  66. func getProduct(body string) (ret wp.WordPressProduct) {
  67. jsonStr := getJson(body)
  68. if idx := strings.Index(jsonStr, "@graph"); idx >= 0 {
  69. p := wp.WordPressItem{}
  70. json.Unmarshal([]byte(jsonStr), &p)
  71. ret = p.Graph[1]
  72. } else {
  73. json.Unmarshal([]byte(jsonStr), &ret)
  74. }
  75. return
  76. }
  77. func getJson(body string) string {
  78. idx := strings.Index(body, "<script type=\"application/ld+json\">")
  79. if idx < 0 {
  80. return ""
  81. }
  82. body = body[idx+len("<script type=\"application/ld+json\">"):]
  83. idx = strings.Index(body, "</script>")
  84. body = body[:idx]
  85. return body
  86. }
  87. func getPrice(body string, finder string) (ret string) {
  88. re := regexp.MustCompile(`[-]?\d[\d,]*[\.]?[\d{2}]*`)
  89. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  90. util.CheckError(err)
  91. splitFinder := strings.Split(finder, ",")
  92. for _, v := range splitFinder {
  93. s := doc.Find(v)
  94. s.Each(func(i int, selection *goquery.Selection) {
  95. if i == 0 {
  96. submatchall := re.FindAllString(selection.Text(), -1)
  97. for key, element := range submatchall {
  98. if key == 0 {
  99. ret = strings.ReplaceAll(element, ",", "")
  100. }
  101. }
  102. }
  103. })
  104. if ret != "" {
  105. break
  106. }
  107. }
  108. return
  109. }
  110. func getImages(body string, finders string) (ret []string) {
  111. for _, finder := range strings.Split(finders, ",") {
  112. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(body))
  113. doc.Find(finder).Each(func(i int, s *goquery.Selection) {
  114. if src, ok := s.Attr("src"); ok {
  115. ret = append(ret, src)
  116. } else if src, ok := s.Attr("href"); ok {
  117. ret = append(ret, src)
  118. }
  119. })
  120. }
  121. return
  122. }
  123. func getOptions(body string, finder string) (rets []modelv2.Option) {
  124. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  125. util.CheckError(err)
  126. skippedFistOption := true
  127. doc.Find(finder).Each(func(i int, selection *goquery.Selection) {
  128. if skippedFistOption {
  129. ret := modelv2.Option{}
  130. ret.Choices = append(ret.Choices, modelv2.Choice{
  131. Name: selection.Text(),
  132. })
  133. rets = append(rets, ret)
  134. }
  135. })
  136. return
  137. }
  138. func getDesc(body string, finder string) string {
  139. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  140. util.CheckError(err)
  141. s := doc.Find(finder)
  142. html := s.Text()
  143. return html
  144. }
  145. func getCategories(body string, finders string) (catss []string) {
  146. doms := strings.Split(finders, ",")
  147. for _, value := range doms {
  148. cats := getDesc(body, value)
  149. idx := strings.Index(cats, ":")
  150. if idx > 0 {
  151. cats = strings.TrimSpace(cats[idx+1:])
  152. }
  153. if strings.Contains(cats, "\n") {
  154. cats = strings.ReplaceAll(cats, "\n", "/")
  155. }
  156. r := []string{}
  157. if strings.Contains(cats, ",") {
  158. r = strings.Split(cats, ",")
  159. } else if strings.Contains(cats, "/") {
  160. r = strings.Split(cats, "/")
  161. }
  162. for _, value := range r {
  163. if strings.TrimSpace(value) != "" {
  164. catss = append(catss, strings.TrimSpace(value))
  165. }
  166. }
  167. // catss = append(catss, c)
  168. }
  169. catss = util.Unique(catss)
  170. return
  171. }
  172. func getName(body string, names string) (str string) {
  173. splitNames := strings.Split(names, ",")
  174. for _, v := range splitNames {
  175. if name := getDesc(body, v); name != "" {
  176. str = name
  177. break
  178. }
  179. }
  180. return
  181. }
  182. func getContext(body string, doms string) (str string) {
  183. splitDoms := strings.Split(doms, ",")
  184. for _, v := range splitDoms {
  185. str = str + getDesc(body, v)
  186. }
  187. return
  188. }
  189. func findEmail(body string, doms string) (emails []string) {
  190. r, _ := regexp.Compile(`[a-zA-Z0-9_+&*-]+(?:\.[a-zA-Z0-9_+&*-]+)*@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,24}`)
  191. splitDoms := strings.Split(doms, ",")
  192. for _, v := range splitDoms {
  193. str := getDesc(body, v)
  194. emails = append(emails, r.FindStringSubmatch(str)...)
  195. }
  196. return
  197. }