// Package parser extracts product information (name, descriptions, price,
// images, categories, options, e-mail addresses) from a fetched HTML page,
// using selectors supplied through a modelv2.MatchingConfig.
package parser

import (
	"encoding/json"
	"fmt"
	"regexp"
	"strconv"
	"strings"

	"kkscrap-go/controllers/scraper/common"
	util "kkscrap-go/controllers/scraper/util"
	wp "kkscrap-go/controllers/scraper/wordpress"
	// model "kkscrap-go/models"
	modelv2 "kkscrap-go/model"

	"github.com/PuerkitoBio/goquery"
	triphtmltags "github.com/denisbrodbeck/striphtmltags"
)

var (
	// priceRe matches an optionally negative number that may contain
	// thousands separators (",") and a decimal part.
	priceRe = regexp.MustCompile(`-?\d[\d,]*\.?\d*`)

	// emailRe matches a plain e-mail address.
	emailRe = regexp.MustCompile(`[a-zA-Z0-9_+&*-]+(?:\.[a-zA-Z0-9_+&*-]+)*@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,24}`)
)

// Parse downloads uri and fills a modelv2.ItemInfo from it.
//
// Values are collected from three sources, in this order:
//  1. <meta property="..."> tags (og:description, og:title,
//     product:price:amount, product:price:currency);
//  2. the product record returned by getProduct;
//  3. the selectors configured in c.
//
// If the page cannot be fetched or parsed, the item is returned as filled so
// far (possibly empty).
func Parse(uri string, c modelv2.MatchingConfig) (item modelv2.ItemInfo) {
	body, err := util.Get(uri)
	if !util.CheckError(err) {
		return
	}
	item.Language = common.GetLanguage(body)

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
	if !util.CheckError(err) {
		return
	}

	doc.Find("meta").Each(func(_ int, s *goquery.Selection) {
		switch property, _ := s.Attr("property"); property {
		case "og:description":
			item.ShortDesc = s.AttrOr("content", "")
		case "og:title":
			item.ItemName = s.AttrOr("content", "")
		case "product:price:amount":
			item.SalesPrice = util.GetFloat32(s.AttrOr("content", ""))
		case "product:price:currency":
			item.Currency = s.AttrOr("content", "")
		}
	})

	p := getProduct(body)
	// item.Images = append(item.Images, p.Image)
	if len(p.Offers) > 0 {
		item.SalesPrice = util.GetFloat32(p.Offers[0].Price)
	}
	// NOTE(review): if p.Sku is an interface-typed field, an absent SKU is
	// rendered as "<nil>" here — confirm against wp.WordPressProduct.
	item.Sku = fmt.Sprintf("%v", p.Sku)

	item.Emails = common.GetEmails(body)
	item.Emails = append(item.Emails, findEmail(body, c.Email)...)
	item.Images = append(item.Images, getImages(body, c.Images)...)

	if c.OriginDesc != "" {
		item.OriginDesc = triphtmltags.StripTags(getContext(body, c.OriginDesc))
	}
	// The og:description value, when present, wins over the configured selector.
	if c.ShortDesc != "" && item.ShortDesc == "" {
		item.ShortDesc = triphtmltags.StripTags(getContext(body, c.ShortDesc))
	}
	if c.TextDesc != "" {
		item.TextDesc = triphtmltags.StripTags(getContext(body, c.TextDesc))
	}

	item.ItemCategory = getCategories(body, c.Category)

	// The configured selector takes priority over og:title, but an empty
	// result must not wipe out a title that was already found.
	if name := getName(body, c.Name); name != "" {
		item.ItemName = name
	}

	// Fall back to the price selector only when nothing else gave a price.
	if item.SalesPrice == 0 {
		if raw := getPrice(body, c.Price); raw != "" {
			if price, err := strconv.ParseFloat(raw, 32); util.CheckError(err) {
				item.SalesPrice = float32(price)
			}
		}
	}

	item.Options = append(item.Options, getOptions(body, c.Options)...)
	return
}

// getProduct decodes the JSON text returned by getJson into a product record.
//
// When the JSON contains "@graph" it is decoded as a wp.WordPressItem and the
// second graph node is used; otherwise it is decoded directly as a product.
// A zero-value product is returned when there is no JSON or the graph has
// fewer than two nodes.
func getProduct(body string) (ret wp.WordPressProduct) {
	jsonStr := getJson(body)
	if jsonStr == "" {
		return
	}

	if strings.Contains(jsonStr, "@graph") {
		var p wp.WordPressItem
		// Report decode problems, but keep whatever was decoded.
		util.CheckError(json.Unmarshal([]byte(jsonStr), &p))
		// Guard the index: a short graph used to panic here.
		if len(p.Graph) > 1 {
			ret = p.Graph[1]
		}
		return
	}

	util.CheckError(json.Unmarshal([]byte(jsonStr), &ret))
	return
}

// getJson returns the part of body that precedes the marker string.
//
// NOTE(review): the marker is the empty string, so strings.Index always
// yields 0 and this function always returns "". The marker literal looks
// like it was lost at some point — confirm the intended delimiter.
func getJson(body string) string {
	idx := strings.Index(body, "")
	body = body[:idx]
	return body
}

// getPrice returns the first number found in the text of the first element
// matched by any of the comma-separated selectors in finder, with thousands
// separators removed. Selectors are tried in order; "" is returned when none
// of them yields a number.
func getPrice(body string, finder string) (ret string) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
	if !util.CheckError(err) {
		return
	}

	for _, sel := range strings.Split(finder, ",") {
		if match := priceRe.FindString(doc.Find(sel).First().Text()); match != "" {
			return strings.ReplaceAll(match, ",", "")
		}
	}
	return
}

// getImages collects image locations from every element matched by the
// comma-separated selectors in finders. For each element the "src" attribute
// is used when present, otherwise "href".
func getImages(body string, finders string) (ret []string) {
	// Parse once; the document does not change between selectors.
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
	if !util.CheckError(err) {
		return
	}

	for _, finder := range strings.Split(finders, ",") {
		doc.Find(finder).Each(func(_ int, s *goquery.Selection) {
			if src, ok := s.Attr("src"); ok {
				ret = append(ret, src)
			} else if href, ok := s.Attr("href"); ok {
				ret = append(ret, href)
			}
		})
	}
	return
}

// getOptions returns one modelv2.Option per element matched by finder; each
// option holds a single choice named after the element's text.
//
// NOTE(review): the previous code carried a "skip first option" flag that was
// always true, so no element was ever skipped. That behaviour is kept here —
// confirm whether the first match (e.g. a placeholder entry) should be dropped.
func getOptions(body string, finder string) (rets []modelv2.Option) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
	if !util.CheckError(err) {
		return
	}

	doc.Find(finder).Each(func(_ int, selection *goquery.Selection) {
		var opt modelv2.Option
		opt.Choices = append(opt.Choices, modelv2.Choice{
			Name: selection.Text(),
		})
		rets = append(rets, opt)
	})
	return
}

// getDesc returns the combined text of all elements in body matched by
// finder, or "" when body cannot be parsed.
func getDesc(body string, finder string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
	if !util.CheckError(err) {
		return ""
	}
	return doc.Find(finder).Text()
}

// getCategories extracts category names from the text matched by each of the
// comma-separated selectors in finders.
//
// For every selector, any "label:" prefix is dropped, newlines are treated as
// "/" separators, and the text is split on "," when it contains one,
// otherwise on "/". Blank entries are discarded and the result is passed
// through util.Unique.
func getCategories(body string, finders string) (catss []string) {
	for _, finder := range strings.Split(finders, ",") {
		cats := getDesc(body, finder)
		if idx := strings.Index(cats, ":"); idx > 0 {
			cats = strings.TrimSpace(cats[idx+1:])
		}
		cats = strings.ReplaceAll(cats, "\n", "/")

		sep := "/"
		if strings.Contains(cats, ",") {
			sep = ","
		}
		// Split returns the whole text as one entry when sep is absent, so a
		// lone category is kept instead of being dropped.
		for _, part := range strings.Split(cats, sep) {
			if name := strings.TrimSpace(part); name != "" {
				catss = append(catss, name)
			}
		}
	}
	catss = util.Unique(catss)
	return
}

// getName returns the text of the first selector in the comma-separated list
// names that matches non-empty text, or "" when none does.
func getName(body string, names string) (str string) {
	for _, sel := range strings.Split(names, ",") {
		if name := getDesc(body, sel); name != "" {
			return name
		}
	}
	return
}

// getContext concatenates the text matched by each of the comma-separated
// selectors in doms, in the order given.
func getContext(body string, doms string) (str string) {
	var b strings.Builder
	for _, sel := range strings.Split(doms, ",") {
		b.WriteString(getDesc(body, sel))
	}
	return b.String()
}

// findEmail returns every e-mail address found in the text matched by each
// of the comma-separated selectors in doms.
func findEmail(body string, doms string) (emails []string) {
	for _, sel := range strings.Split(doms, ",") {
		// The pattern has no capture groups, so FindAllString is needed to
		// get more than the first address per element.
		emails = append(emails, emailRe.FindAllString(getDesc(body, sel), -1)...)
	}
	return
}