parser_common.go 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. package common
  2. import (
  3. "kkscrap-go/model"
  4. "regexp"
  5. "strings"
  6. util "kkscrap-go/controllers/scraper/util"
  7. "github.com/PuerkitoBio/goquery"
  8. )
  9. func GetLanguage(body string) string {
  10. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  11. util.CheckError(err)
  12. ret, _ := doc.Find("html").Attr("lang")
  13. return ret
  14. }
  // emailPattern matches an e-mail-like token: a dot-separated local part, an
  // "@", one or more dot-terminated domain labels and a final label of 2-24
  // letters. It is compiled once at package initialisation rather than on
  // every GetEmails call.
  var emailPattern = regexp.MustCompile(`[a-zA-Z0-9_+&*-]+(?:\.[a-zA-Z0-9_+&*-]+)*@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,24}`)

  // GetEmails returns the distinct e-mail-like strings found in body, in the
  // order in which each one first appears.
  //
  // Matches are de-duplicated by exact (case-sensitive) text. Known vendor
  // support addresses and asset file names that merely look like addresses
  // (for example "logo@2x.png") are skipped; see isIgnoredEmail. The result is
  // nil when nothing qualifies.
  func GetEmails(body string) (ret []string) {
  	seen := make(map[string]struct{})
  	for _, candidate := range emailPattern.FindAllString(body, -1) {
  		if isIgnoredEmail(candidate) {
  			continue
  		}
  		if _, dup := seen[candidate]; dup {
  			continue
  		}
  		seen[candidate] = struct{}{}
  		// Appending on first sight keeps the output order stable; ranging
  		// over the set afterwards would yield a random order on every call.
  		ret = append(ret, candidate)
  	}
  	return ret
  }

  // isIgnoredEmail reports whether candidate should be left out of the
  // GetEmails result: either it is one of the blocked support addresses or it
  // ends in an image/script file extension. The comparison ignores case so
  // that upper-cased page content is filtered the same way.
  //
  // No URL-prefix check is needed: a match of emailPattern always begins with
  // a local-part character and cannot contain ":" or "/" before the "@", so it
  // can never start with "/", "http://" or "https://".
  func isIgnoredEmail(candidate string) bool {
  	lower := strings.ToLower(candidate)
  	switch lower {
  	case "support@crema.me", "support@snapvi.co.kr":
  		return true
  	}
  	for _, ext := range []string{".png", ".jpg", ".js"} {
  		if strings.HasSuffix(lower, ext) {
  			return true
  		}
  	}
  	return false
  }
  34. func Parse(uri string, item *model.ItemInfo) {
  35. body, err := util.Get(uri)
  36. util.CheckError(err)
  37. item.Language = GetLanguage(body)
  38. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  39. util.CheckError(err)
  40. doc.Find("meta").Each(func(i int, s *goquery.Selection) {
  41. v, _ := s.Attr("property")
  42. if v == "og:description" {
  43. item.ShortDesc = s.AttrOr("content", "")
  44. } else if v == "og:title" {
  45. item.ItemName = s.AttrOr("content", "")
  46. } else if v == "og:image" {
  47. item.Images = append(item.Images, s.AttrOr("content", ""))
  48. } else if v == "product:price:amount" {
  49. item.SalesPrice = util.GetFloat32(s.AttrOr("content", ""))
  50. } else if v == "product:price:currency" {
  51. item.Currency = s.AttrOr("content", "")
  52. }
  53. })
  54. return
  55. }