123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316 |
- package controllers_scraper
- import (
- "crypto/md5"
- "encoding/hex"
- "encoding/json"
- "fmt"
- "kkscrap-go/controllers/scraper/cafe24"
- "kkscrap-go/controllers/scraper/dabory"
- "kkscrap-go/controllers/scraper/godo"
- "kkscrap-go/controllers/scraper/magento"
- parserData "kkscrap-go/controllers/scraper/parser"
- "kkscrap-go/controllers/scraper/shopify"
- "kkscrap-go/controllers/scraper/util"
- "kkscrap-go/controllers/scraper/wordpress"
- "kkscrap-go/controllers/scraper/young"
- "kkscrap-go/locals"
- "kkscrap-go/model"
- "net/http"
- "net/url"
- "regexp"
- "strings"
- e "github.com/dabory/abango-rest/etc"
- "github.com/gogf/gf/v2/encoding/gurl"
- // "golang.org/x/crypto/bcrypt"
- "github.com/labstack/echo"
- )
// SolutionTypeGetReq is the request payload that SolutionTypeGet reads from the
// echo context under the "receiver" key.
type SolutionTypeGetReq struct {
	// Url is the address of the page that is fetched and inspected.
	Url string
}
- func SolutionTypeGet(c echo.Context) error {
- v := c.Get("receiver").(SolutionTypeGetReq)
- retv := &struct {
- SolutionType string
- ThemeType string
- }{}
- body, err := util.Get(v.Url)
- if err != nil {
- return c.JSON(404, err.Error())
- }
- // fmt.Println(v.Url)
- st, theme, err := getSolutionType(body)
- if err != nil {
- return c.JSONBlob(http.StatusOK, []byte(err.Error()))
- }
- retv.SolutionType = string(st)
- retv.ThemeType = theme
- return c.JSON(http.StatusOK, retv)
- }
- type ProductPageGetReq struct {
- SolutionType model.SolutionType
- ThemeType string
- Products []ProductUri
- Regexs string
- ConfigData string
- Config model.MatchingConfig
- Reindex bool
- }
// ProductUri names one product page inside a ProductPageGetReq batch.
type ProductUri struct {
	// Uri is the address of the product page to fetch and scrape.
	Uri string
}
- // 오리지널 웹사이트 전체를 업테이트 하는 경우 Uri를 하나씩 보내면 비효율적이므로 하나의 배치로
- // 묶어서 요청할 수 있도록 한다. 주로 product-page-get를 쓰고 item-url-scrap은 deprecate 예정임.
- func ProductPageGet(c echo.Context) error {
- v := c.Get("receiver").(ProductPageGetReq)
- // conf := model.MatchingConfig{}
- json.Unmarshal([]byte(v.ConfigData), &v.Config)
- var vRet locals.ProductPage // Row(개별레코드)->Page(Row의 집합)->Book(Page의 집합)의 개념
- for _, p := range v.Products {
- body, err := util.Get(p.Uri)
- if err != nil {
- e.ErrLog(e.FuncRun("03uoaiuor0", e.CurrFuncName()), err)
- }
- // 전체 웹사이트가 아니라 개별 상품페이지(1개페이지)의 경우 SolutionType 없이 request됨
- if v.SolutionType == "" {
- var err error
- v.SolutionType, v.ThemeType, err = getSolutionType(body)
- if err != nil {
- return c.String(705, "Solution Type Not Found:"+err.Error())
- }
- }
- // prodInfo := parseSolution(v.SolutionType, v.ThemeType, p.Uri, body)
- prodInfo := parsePageData(v.SolutionType, ItemUrlScrapReq{
- ItemUrl: p.Uri, Config: v.Config, Reindex: false,
- }, 0)
- vRet.ProductPage = append(vRet.ProductPage, toProductPage(prodInfo))
- }
- // ret, _ := json.MarshalIndent(itemInfo, "", "\t")
- ret, _ := json.Marshal(vRet)
- return c.JSONBlob(http.StatusOK, ret)
- }
- func toProductPage(info model.ItemInfo) locals.Product {
- ret := locals.Product{
- SolutionName: locals.SolutionType(info.SolutionName),
- Version: info.Version,
- Emails: info.Emails,
- DomainName: info.DomainName,
- DomainURI: info.DomainURI,
- ItemName: info.ItemName,
- ItemNick: info.ItemNick,
- ModelName: info.ModelName,
- ModelNo: info.ModelNo,
- BrandName: info.BrandName,
- Sku: info.Sku,
- ItemCategory: info.ItemCategory,
- Manufacturer: info.Manufacturer,
- Origin: info.Origin,
- Language: info.Language,
- Currency: info.Currency,
- SalesPrice: info.SalesPrice,
- DeliveryPrice: info.DeliveryPrice,
- MinimumQty: info.MinimumQty,
- UserCredit: info.UserCredit,
- Options: nil,
- Images: info.Images,
- ShortDesc: info.ShortDesc,
- OriginDesc: info.OriginDesc,
- TextDesc: info.TextDesc,
- Cats: info.Cats,
- Suggest: info.Suggest,
- HashUrl: info.HashUrl,
- HashContent: info.HashContent,
- }
- for i, v := range info.Options {
- ret.Options = append(ret.Options, locals.Option{
- Name: v.Name,
- })
- ret.Options[i].Choices = make([]locals.Choice, 0)
- for _, choice := range v.Choices {
- ret.Options[i].Choices = append(ret.Options[i].Choices, locals.Choice{
- Name: choice.Name,
- Price: choice.Price,
- })
- }
- }
- return ret
- }
- type ItemUrlScrapReq struct {
- ItemUrl string
- Regexs string
- ConfigData string
- Config model.MatchingConfig
- Reindex bool
- }
- func ItemUrlScrap(c echo.Context) error {
- v := c.Get("receiver").(ItemUrlScrapReq)
- // body, err := util.Get(v.ItemUrl)
- // if err != nil {
- // return err
- // }
- itemInfo, err := parse(v, 0)
- if err != nil {
- return c.String(604, "ertvwerawqfd-ItemUrl Parse failed: "+err.Error())
- }
- ret, _ := json.MarshalIndent(itemInfo, "", "\t")
- // fmt.Println(string(data))
- // ret, _ := json.Marshal(itemInfo)
- return c.JSONBlob(http.StatusOK, ret)
- }
// regexpTitle matches an HTML <title> element and captures its text.
//
//   - (?i) accepts any letter case, e.g. <TITLE>.
//   - (?s) lets the captured text span several lines.
//   - (?:\s[^>]*)? tolerates attributes on the opening tag.
//   - (.*?) is lazy so the capture stops at the first closing tag, even when
//     later elements (such as an inline SVG <title>) follow on the same line.
//
// It is compiled once at package scope; MustCompile makes a broken pattern
// fail loudly at start-up rather than leaving a nil regexp behind.
var regexpTitle = regexp.MustCompile(`(?is)<title(?:\s[^>]*)?>(.*?)</title>`)

// getTitle returns the text of the first <title> element found in body, exactly
// as it appears in the markup (no trimming or entity decoding). It returns the
// empty string when body contains no complete <title> element.
func getTitle(body string) string {
	if m := regexpTitle.FindStringSubmatch(body); m != nil {
		return m[1]
	}
	return ""
}
- func parse(config ItemUrlScrapReq, interval int) (ret model.ItemInfo, err error) {
- conf := model.MatchingConfig{}
- json.Unmarshal([]byte(config.ConfigData), &conf)
- config.Config = conf
- body, _ := util.Get(config.ItemUrl)
- t, _, err := getSolutionType(body)
- if err != nil {
- return
- }
- ret = parsePageData(t, config, interval)
- return
- }
- func parseSolution(t model.SolutionType, theme, uri, body string) (ret model.ItemInfo) {
- ret.SolutionName = t
- u, err := url.Parse(uri)
- if err != nil {
- return
- }
- ret.DomainName = u.Host
- ret.DomainURI = uri
- if t == model.SolutionTypeWooCommerce {
- wordpress.Parse(body, &ret)
- } else if t == model.SolutionTypeDabory {
- dabory.Parse(body, &ret)
- } else if t == model.SolutionTypeShopify {
- shopify.Parse(body, &ret)
- } else if t == model.SolutionTypeMagento {
- magento.Parse(body, &ret)
- } else if t == model.SolutionTypeCafe24 {
- cafe24.Parse(body, &ret)
- } else if t == model.SolutionTypeGodo {
- godo.Parse(body, &ret)
- } else if t == model.SolutionTypeYoung {
- young.Parse(body, &ret)
- } else if t == model.SolutionTypeOthers {
- magento.Parse(body, &ret)
- }
- return
- }
- func parsePageData(t model.SolutionType, config ItemUrlScrapReq, interval int) (ret model.ItemInfo) {
- rawUrl, _ := gurl.RawDecode(config.ItemUrl)
- u, _ := url.Parse(rawUrl)
- hash := md5.New()
- ret = parserData.Parse(rawUrl, config.Config)
- println(fmt.Sprintf("%v", ret.Images))
- ret.DomainName = u.Host
- ret.SolutionName = t
- if ret.SolutionName == "" {
- ret.SolutionName = "NON"
- }
- ret.DomainURI = rawUrl
- ret.Cats = ret.ItemCategory
- hashContent, _ := json.Marshal(ret)
- hash.Write(hashContent)
- // if ret.Images == nil || len(ret.Images) == 0 {
- // return
- // }
- ret.HashContent = hex.EncodeToString((hash.Sum(nil)))
- hash.Reset()
- hash.Write([]byte(rawUrl))
- ret.HashUrl = hex.EncodeToString((hash.Sum([]byte(nil))))
- hash.Reset()
- ret.Suggest = strings.Split(ret.ItemName, " ")
- // time.Sleep(1 * time.Second)
- return
- }
- func getSolutionType(body string) (t model.SolutionType, theme string, reterr error) {
- if strings.Contains(body, "window.CAFE24") {
- t = model.SolutionTypeCafe24
- } else if strings.Contains(body, "dbrshop") {
- t = model.SolutionTypeDabory
- } else if strings.Contains(body, "/wp-content/") {
- t = model.SolutionTypeWooCommerce
- } else if strings.Contains(body, "고도몰5") {
- t = model.SolutionTypeGodo
- } else if strings.Contains(body, "cdn.shopify.com") {
- t = model.SolutionTypeShopify
- } else if strings.Contains(body, "magento") {
- t = model.SolutionTypeMagento
- } else if strings.Contains(body, "it_id=") {
- t = model.SolutionTypeYoung
- } else {
- t = model.SolutionTypeOthers
- //reterr = errors.New("no found solution type")
- }
- theme = ""
- re := regexp.MustCompile(`wp-content\/themes\/(.+?)\/`)
- result := re.FindStringSubmatch(body)
- if len(result) > 1 {
- theme = result[1]
- } else {
- theme = "generic"
- }
- //ioutil.WriteFile(string(t) + ".html", []byte(body), 644)
- return
- }
|