12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061 |
- package common
- import (
- "kkscrap-go/model"
- "regexp"
- "strings"
- util "kkscrap-go/controllers/scraper/util"
- "github.com/PuerkitoBio/goquery"
- )
- func GetLanguage(body string) string {
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
- util.CheckError(err)
- ret, _ := doc.Find("html").Attr("lang")
- return ret
- }
// emailMatchPattern matches e-mail-like tokens: a local part made of letters,
// digits and a set of punctuation characters, an '@', and a lowercase domain
// ending in a dot followed by lowercase letters. It is compiled once at
// package initialization instead of on every GetEmails call.
var emailMatchPattern = regexp.MustCompile("[a-zA-Z0-9.!#$%&*+\\-/=?^_`{|}~]+@[a-z0-9.\\-]+\\.[a-z]+")

// GetEmails returns the distinct e-mail-like tokens found in body, in the
// order in which each one first appears.
//
// The following matches are discarded:
//   - the fixed addresses support@crema.me and support@snapvi.co.kr;
//   - tokens that begin with "/", "http://" or "https://" (path or URL
//     fragments that happen to contain an '@');
//   - tokens that end in ".png", ".jpg" or ".js" (asset names such as
//     "logo@2x.png").
//
// The result is nil when nothing is found.
func GetEmails(body string) (ret []string) {
	// seen tracks addresses already emitted so duplicates are dropped while
	// the output keeps a deterministic, first-occurrence order.
	seen := make(map[string]struct{})

	for _, e := range emailMatchPattern.FindAllString(body, -1) {
		if e == "support@crema.me" || e == "support@snapvi.co.kr" ||
			strings.HasPrefix(e, "/") || strings.HasPrefix(e, "http://") || strings.HasPrefix(e, "https://") ||
			strings.HasSuffix(e, ".png") || strings.HasSuffix(e, ".jpg") || strings.HasSuffix(e, ".js") {
			continue
		}
		if _, dup := seen[e]; dup {
			continue
		}
		seen[e] = struct{}{}
		ret = append(ret, e)
	}
	return ret
}
- func Parse(uri string, item *model.ItemInfo) {
- body, err := util.Get(uri)
- util.CheckError(err)
- item.Language = GetLanguage(body)
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
- util.CheckError(err)
- doc.Find("meta").Each(func(i int, s *goquery.Selection) {
- v, _ := s.Attr("property")
- if v == "og:description" {
- item.ShortDesc = s.AttrOr("content", "")
- } else if v == "og:title" {
- item.ItemName = s.AttrOr("content", "")
- } else if v == "og:image" {
- item.Images = append(item.Images, s.AttrOr("content", ""))
- } else if v == "product:price:amount" {
- item.SalesPrice = util.GetFloat32(s.AttrOr("content", ""))
- } else if v == "product:price:currency" {
- item.Currency = s.AttrOr("content", "")
- }
- })
- return
- }
|