123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239 |
- package parser
- import (
- "encoding/json"
- "fmt"
- "kkscrap-go/controllers/scraper/common"
- util "kkscrap-go/controllers/scraper/util"
- wp "kkscrap-go/controllers/scraper/wordpress"
- // model "kkscrap-go/models"
- modelv2 "kkscrap-go/model"
- "regexp"
- "strconv"
- "strings"
- "github.com/PuerkitoBio/goquery"
- triphtmltags "github.com/denisbrodbeck/striphtmltags"
- )
- func Parse(uri string, c modelv2.MatchingConfig) (item modelv2.ItemInfo) {
- body, err := util.Get(uri)
- if !util.CheckError(err) {
- return
- }
- item.Language = common.GetLanguage(body)
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
- if !util.CheckError(err) {
- return
- }
- doc.Find("meta").Each(func(i int, s *goquery.Selection) {
- v, _ := s.Attr("property")
- if v == "og:description" {
- item.ShortDesc = s.AttrOr("content", "")
- } else if v == "og:title" {
- item.ItemName = s.AttrOr("content", "")
- } else if v == "product:price:amount" {
- item.SalesPrice = util.GetFloat32(s.AttrOr("content", ""))
- } else if v == "product:price:currency" {
- item.Currency = s.AttrOr("content", "")
- }
- })
- p := getProduct(body)
- // item.Images = append(item.Images, p.Image)
- if len(p.Offers) > 0 {
- item.SalesPrice = util.GetFloat32(p.Offers[0].Price)
- }
- item.Sku = fmt.Sprintf("%v", p.Sku)
- images := getImages(body, c.Images)
- item.Emails = common.GetEmails(body)
- item.Emails = append(item.Emails, findEmail(body, c.Email)...)
- item.Images = append(item.Images, images...)
- if c.OriginDesc != "" {
- item.OriginDesc = triphtmltags.StripTags(getContext(body, c.OriginDesc))
- }
- if c.ShortDesc != "" && item.ShortDesc == "" {
- item.ShortDesc = triphtmltags.StripTags(getContext(body, c.ShortDesc))
- }
- if c.TextDesc != "" {
- item.TextDesc = triphtmltags.StripTags(getContext(body, c.TextDesc))
- }
- item.ItemCategory = getCategories(body, c.Category)
- item.ItemName = getName(body, c.Name)
- price, _ := strconv.ParseFloat(getPrice(body, c.Price), 32)
- if item.SalesPrice == 0 && getPrice(body, c.Price) != "" {
- item.SalesPrice = float32(price)
- }
- item.Options = append(item.Options, getOptions(body, c.Options)...)
- return
- }
- func getProduct(body string) (ret wp.WordPressProduct) {
- jsonStr := getJson(body)
- if idx := strings.Index(jsonStr, "@graph"); idx >= 0 {
- p := wp.WordPressItem{}
- json.Unmarshal([]byte(jsonStr), &p)
- ret = p.Graph[1]
- } else {
- json.Unmarshal([]byte(jsonStr), &ret)
- }
- return
- }
// getJson returns the text between the first
// <script type="application/ld+json"> tag in body and the next </script>.
// It returns "" when the opening tag is absent or is never closed.
func getJson(body string) string {
	const (
		openTag  = `<script type="application/ld+json">`
		closeTag = "</script>"
	)

	start := strings.Index(body, openTag)
	if start < 0 {
		return ""
	}
	rest := body[start+len(openTag):]

	end := strings.Index(rest, closeTag)
	if end < 0 {
		// Unterminated script block: slicing with -1 would panic.
		return ""
	}
	return rest[:end]
}
- func getPrice(body string, finder string) (ret string) {
- re := regexp.MustCompile(`[-]?\d[\d,]*[\.]?[\d{2}]*`)
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
- util.CheckError(err)
- splitFinder := strings.Split(finder, ",")
- for _, v := range splitFinder {
- s := doc.Find(v)
- s.Each(func(i int, selection *goquery.Selection) {
- if i == 0 {
- submatchall := re.FindAllString(selection.Text(), -1)
- for key, element := range submatchall {
- if key == 0 {
- ret = strings.ReplaceAll(element, ",", "")
- }
- }
- }
- })
- if ret != "" {
- break
- }
- }
- return
- }
- func getImages(body string, finders string) (ret []string) {
- for _, finder := range strings.Split(finders, ",") {
- doc, _ := goquery.NewDocumentFromReader(strings.NewReader(body))
- doc.Find(finder).Each(func(i int, s *goquery.Selection) {
- if src, ok := s.Attr("src"); ok {
- ret = append(ret, src)
- } else if src, ok := s.Attr("href"); ok {
- ret = append(ret, src)
- }
- })
- }
- return
- }
- func getOptions(body string, finder string) (rets []modelv2.Option) {
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
- util.CheckError(err)
- skippedFistOption := true
- doc.Find(finder).Each(func(i int, selection *goquery.Selection) {
- if skippedFistOption {
- ret := modelv2.Option{}
- ret.Choices = append(ret.Choices, modelv2.Choice{
- Name: selection.Text(),
- })
- rets = append(rets, ret)
- }
- })
- return
- }
- func getDesc(body string, finder string) string {
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
- util.CheckError(err)
- s := doc.Find(finder)
- html := s.Text()
- return html
- }
- func getCategories(body string, finders string) (catss []string) {
- doms := strings.Split(finders, ",")
- for _, value := range doms {
- cats := getDesc(body, value)
- idx := strings.Index(cats, ":")
- if idx > 0 {
- cats = strings.TrimSpace(cats[idx+1:])
- }
- if strings.Contains(cats, "\n") {
- cats = strings.ReplaceAll(cats, "\n", "/")
- }
- r := []string{}
- if strings.Contains(cats, ",") {
- r = strings.Split(cats, ",")
- } else if strings.Contains(cats, "/") {
- r = strings.Split(cats, "/")
- }
- for _, value := range r {
- if strings.TrimSpace(value) != "" {
- catss = append(catss, strings.TrimSpace(value))
- }
- }
- // catss = append(catss, c)
- }
- catss = util.Unique(catss)
- return
- }
- func getName(body string, names string) (str string) {
- splitNames := strings.Split(names, ",")
- for _, v := range splitNames {
- if name := getDesc(body, v); name != "" {
- str = name
- break
- }
- }
- return
- }
- func getContext(body string, doms string) (str string) {
- splitDoms := strings.Split(doms, ",")
- for _, v := range splitDoms {
- str = str + getDesc(body, v)
- }
- return
- }
- func findEmail(body string, doms string) (emails []string) {
- r, _ := regexp.Compile(`[a-zA-Z0-9_+&*-]+(?:\.[a-zA-Z0-9_+&*-]+)*@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,24}`)
- splitDoms := strings.Split(doms, ",")
- for _, v := range splitDoms {
- str := getDesc(body, v)
- emails = append(emails, r.FindStringSubmatch(str)...)
- }
- return
- }
|