parser_common.go 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
  1. package common
  2. import (
  3. "kkscrap-go/model"
  4. "regexp"
  5. "strings"
  6. util "kkscrap-go/controllers/scraper/util"
  7. "github.com/PuerkitoBio/goquery"
  8. )
  9. func GetLanguage(body string) string {
  10. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  11. util.CheckError(err)
  12. ret, _ := doc.Find("html").Attr("lang")
  13. return ret
  14. }
  15. func GetEmails(body string) (ret []string) {
  16. reg := regexp.MustCompile("[a-zA-Z0-9.!#$%&*+\\-/=?^_`{|}~]+@[a-z0-9.\\-]+\\.[a-z]+")
  17. ss := reg.FindAllStringSubmatch(body, -1)
  18. m := make(map[string]bool)
  19. for _, s := range ss {
  20. e := s[0]
  21. if e == "support@crema.me" || e == "support@snapvi.co.kr" || strings.HasPrefix(e, "/") || strings.HasPrefix(e, "http://") ||
  22. strings.HasPrefix(e, "https://") || strings.HasSuffix(e, ".png") || strings.HasSuffix(e, ".jpg") || strings.HasSuffix(e, ".js") {
  23. continue
  24. }
  25. m[s[0]] = true
  26. }
  27. for k, _ := range m {
  28. ret = append(ret, k)
  29. }
  30. return ret
  31. }
  32. func Parse(uri string, item *model.ItemInfo) {
  33. body, err := util.Get(uri)
  34. util.CheckError(err)
  35. item.Language = GetLanguage(body)
  36. doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
  37. util.CheckError(err)
  38. doc.Find("meta").Each(func(i int, s *goquery.Selection) {
  39. v, _ := s.Attr("property")
  40. if v == "og:description" {
  41. item.ShortDesc = s.AttrOr("content", "")
  42. } else if v == "og:title" {
  43. item.ItemName = s.AttrOr("content", "")
  44. } else if v == "og:image" {
  45. item.Images = append(item.Images, s.AttrOr("content", ""))
  46. } else if v == "product:price:amount" {
  47. item.SalesPrice = util.GetFloat32(s.AttrOr("content", ""))
  48. } else if v == "product:price:currency" {
  49. item.Currency = s.AttrOr("content", "")
  50. }
  51. })
  52. return
  53. }