package common import ( "kkscrap-go/model" "regexp" "strings" util "kkscrap-go/controllers/scraper/util" "github.com/PuerkitoBio/goquery" ) func GetLanguage(body string) string { doc, err := goquery.NewDocumentFromReader(strings.NewReader(body)) util.CheckError(err) ret, _ := doc.Find("html").Attr("lang") return ret } func GetEmails(body string) (ret []string) { // reg := regexp.MustCompile("[a-zA-Z0-9.!#$%&*+\\-/=?^_`{|}~]+@[a-z0-9.\\-]+\\.[a-z]+") reg, _ := regexp.Compile(`[a-zA-Z0-9_+&*-]+(?:\.[a-zA-Z0-9_+&*-]+)*@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,24}`) // fmt.Println(r.FindStringSubmatch("주식회사 서울니트디자인 대표: 윤근영 사업자등록번호: 129-86-89635 통신판매업번호 : 2021-성남중원-0367 주소: 경기도 성남시 중원구 갈마치로 215, 금강펜테리움아이티타워 A동 309호(상대원동) 전화: 031-730-0525 이메일: SOYOON58@NAVER.COM COPYRIGHT(C) WWW.SEOULKNIT.COM. ALL RIGHTS RESERVED. HOSTING BY CODEM")) ss := reg.FindAllStringSubmatch(body, -1) m := make(map[string]bool) for _, s := range ss { e := s[0] if e == "support@crema.me" || e == "support@snapvi.co.kr" || strings.HasPrefix(e, "/") || strings.HasPrefix(e, "http://") || strings.HasPrefix(e, "https://") || strings.HasSuffix(e, ".png") || strings.HasSuffix(e, ".jpg") || strings.HasSuffix(e, ".js") { continue } m[s[0]] = true } for k, _ := range m { ret = append(ret, k) } return ret } func Parse(uri string, item *model.ItemInfo) { body, err := util.Get(uri) util.CheckError(err) item.Language = GetLanguage(body) doc, err := goquery.NewDocumentFromReader(strings.NewReader(body)) util.CheckError(err) doc.Find("meta").Each(func(i int, s *goquery.Selection) { v, _ := s.Attr("property") if v == "og:description" { item.ShortDesc = s.AttrOr("content", "") } else if v == "og:title" { item.ItemName = s.AttrOr("content", "") } else if v == "og:image" { item.Images = append(item.Images, s.AttrOr("content", "")) } else if v == "product:price:amount" { item.SalesPrice = util.GetFloat32(s.AttrOr("content", "")) } else if v == "product:price:currency" { item.Currency = s.AttrOr("content", "") } }) return }