123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172 |
- package wordpress
- import (
- "encoding/json"
- "fmt"
- "kkscrap-go/controllers/scraper/common"
- "kkscrap-go/controllers/scraper/util"
- "kkscrap-go/model"
- "strings"
- "github.com/PuerkitoBio/goquery"
- )
- func Parse(body string, item *model.ItemInfo) {
- p := getProduct(body)
- // if p.Image != "" {
- // item.Images = append(item.Images, p.Image)
- // }
- item.Images = append(item.Images, getImages(body)...)
- if p.Offers != nil && len(p.Offers) > 0 {
- item.SalesPrice = util.GetFloat32(p.Offers[0].Price)
- item.Currency = p.Offers[0].Pricecurrency
- }
- item.Sku = fmt.Sprintf("%v", p.Sku)
- item.ShortDesc = p.Description
- item.OriginDesc = getProductDescription(body)
- item.TextDesc = getTextDesc(body)
- item.ItemName = p.Name
- item.Language = common.GetLanguage(body)
- item.Emails = common.GetEmails(body)
- item.Options = append(item.Options, getOptions(body))
- cats := getCategories(body)
- catss := []string{}
- if strings.Contains(cats, ",") {
- catss = strings.Split(cats, ",")
- } else if strings.Contains(cats, "/") {
- catss = strings.Split(cats, "/")
- }
- for _, v := range catss {
- item.ItemCategory = append(item.ItemCategory, strings.TrimSpace(v))
- }
- return
- }
- func getProductDescription(body string) string {
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
- util.CheckError(err)
- s := doc.Find("div.woocommerce-Tabs-panel--description")
- //s.Each(func(i int, selection *goquery.Selection) {
- // log.Println(selection.Html())
- //})
- html, err := s.Html()
- if err != nil {
- return ""
- }
- return html
- }
- func getTextDesc(body string) string {
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
- util.CheckError(err)
- text := doc.Find("div.woocommerce-tabs").Text()
- return text
- }
- func getProduct(body string) (ret WordPressProduct) {
- jsonStr := getJson(body)
- if idx := strings.Index(jsonStr, "@graph"); idx >= 0 {
- p := WordPressItem{}
- json.Unmarshal([]byte(jsonStr), &p)
- ret = p.Graph[1]
- } else {
- json.Unmarshal([]byte(jsonStr), &ret)
- }
- return
- }
// getJson returns the raw contents of the first
// `<script type="application/ld+json">` element in body.
//
// It returns "" when the opening tag is not present, or when the opening tag
// is found but no closing "</script>" follows it (a truncated or malformed
// page). Only the exact opening-tag spelling above is recognized.
func getJson(body string) string {
	const openTag = "<script type=\"application/ld+json\">"
	const closeTag = "</script>"

	start := strings.Index(body, openTag)
	if start < 0 {
		return ""
	}
	rest := body[start+len(openTag):]

	end := strings.Index(rest, closeTag)
	if end < 0 {
		// Unterminated script block: slicing with -1 would panic.
		return ""
	}
	return rest[:end]
}
- func getImages(body string) (ret []string) {
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
- util.CheckError(err)
- //doc.Find("img.wp-post-image").Each(func(i int, s *goquery.Selection) {
- // if src, ok := s.Attr("src"); ok {
- // width, _ := s.Attr("width")
- // height, _ := s.Attr("height")
- // w, _ := strconv.ParseInt(width, 10, 64)
- // h, _ := strconv.ParseInt(height, 10, 64)
- // //log.Println(src, width, height)
- // ret = append(ret, model.Image{
- // Path: src,
- // Width: int(w),
- // Height: int(h),
- // })
- // }
- //})
- //sort.Slice(ret, func(i, j int) bool {
- // return ret[i].Width*ret[i].Height > ret[j].Width*ret[j].Height
- //})
- //ret = ret[:1]
- // re := regexp.MustCompile(`-\d+x\d+\.`)
- doc.Find("figure.woocommerce-product-gallery__wrapper div a").Each(func(i int, s *goquery.Selection) {
- if src, ok := s.Attr("href"); ok {
- ret = append(ret, src)
- }
- })
- return
- }
- func getPrice(body string) (ret string) {
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
- util.CheckError(err)
- s := doc.Find("span.woocommerce-Price-amount.amount")
- s.Each(func(i int, selection *goquery.Selection) {
- if i == 0 {
- ret = selection.Text()
- }
- })
- //log.Println(s.Nodes[0].FirstChild.FirstChild.Data)
- //log.Println(s.Nodes[0].FirstChild.NextSibling.Data)
- return
- }
- func getCategories(body string) (ret string) {
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
- util.CheckError(err)
- s := doc.Find("span.posted_in")
- ret = s.Text()
- idx := strings.Index(ret, ":")
- if idx > 0 {
- ret = strings.TrimSpace(ret[idx+1:])
- }
- //log.Println(s.Nodes[0].FirstChild.FirstChild.Data)
- //log.Println(s.Nodes[0].FirstChild.NextSibling.Data)
- return
- }
- func getOptions(body string) (ret model.Option) {
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
- util.CheckError(err)
- s := doc.Find("body select")
- s.Each(func(i int, selection *goquery.Selection) {
- skippedFistOption := false
- v, _ := selection.Attr("name")
- if v != "rating" {
- selection.Find("option").Each(func(i int, selection *goquery.Selection) {
- if skippedFistOption {
- ret.Choices = append(ret.Choices, model.Choice{
- Name: selection.Text(),
- })
- } else {
- skippedFistOption = true
- }
- })
- }
- })
- return
- }
|