package controllers_scraper import ( "crypto/md5" "encoding/hex" "encoding/json" "fmt" "kkscrap-go/controllers/scraper/cafe24" "kkscrap-go/controllers/scraper/dabory" "kkscrap-go/controllers/scraper/godo" "kkscrap-go/controllers/scraper/magento" parserData "kkscrap-go/controllers/scraper/parser" "kkscrap-go/controllers/scraper/shopify" "kkscrap-go/controllers/scraper/util" "kkscrap-go/controllers/scraper/wordpress" "kkscrap-go/controllers/scraper/young" "kkscrap-go/locals" "kkscrap-go/model" "net/http" "net/url" "regexp" "strings" e "github.com/dabory/abango-rest/etc" "github.com/gogf/gf/v2/encoding/gurl" // "golang.org/x/crypto/bcrypt" "github.com/labstack/echo" ) type SolutionTypeGetReq struct { Url string } func SolutionTypeGet(c echo.Context) error { v := c.Get("receiver").(SolutionTypeGetReq) retv := &struct { SolutionType string ThemeType string }{} body, err := util.Get(v.Url) if err != nil { return c.JSON(404, err.Error()) } // fmt.Println(v.Url) st, theme, err := getSolutionType(body) if err != nil { return c.JSONBlob(http.StatusOK, []byte(err.Error())) } retv.SolutionType = string(st) retv.ThemeType = theme return c.JSON(http.StatusOK, retv) } type ProductPageGetReq struct { SolutionType model.SolutionType ThemeType string Products []ProductUri Regexs string ConfigData string Config model.MatchingConfig Reindex bool } type ProductUri struct { Uri string } // 오리지널 웹사이트 전체를 업테이트 하는 경우 Uri를 하나씩 보내면 비효율적이므로 하나의 배치로 // 묶어서 요청할 수 있도록 한다. 주로 product-page-get를 쓰고 item-url-scrap은 deprecate 예정임. func ProductPageGet(c echo.Context) error { v := c.Get("receiver").(ProductPageGetReq) // conf := model.MatchingConfig{} json.Unmarshal([]byte(v.ConfigData), &v.Config) var vRet locals.ProductPage // Row(개별레코드)->Page(Row의 집합)->Book(Page의 집합)의 개념 for _, p := range v.Products { body, err := util.Get(p.Uri) if err != nil { e.ErrLog(e.FuncRun("03uoaiuor0", e.CurrFuncName()), err) } // 전체 웹사이트가 아니라 개별 상품페이지(1개페이지)의 경우 SolutionType 없이 request됨 if v.SolutionType == "" { var err error v.SolutionType, v.ThemeType, err = getSolutionType(body) if err != nil { return c.String(705, "Solution Type Not Found:"+err.Error()) } } // prodInfo := parseSolution(v.SolutionType, v.ThemeType, p.Uri, body) prodInfo := parsePageData(v.SolutionType, ItemUrlScrapReq{ ItemUrl: p.Uri, Config: v.Config, Reindex: false, }, 0) vRet.ProductPage = append(vRet.ProductPage, toProductPage(prodInfo)) } // ret, _ := json.MarshalIndent(itemInfo, "", "\t") ret, _ := json.Marshal(vRet) return c.JSONBlob(http.StatusOK, ret) } func toProductPage(info model.ItemInfo) locals.Product { ret := locals.Product{ SolutionName: locals.SolutionType(info.SolutionName), Version: info.Version, Emails: info.Emails, DomainName: info.DomainName, DomainURI: info.DomainURI, ItemName: info.ItemName, ItemNick: info.ItemNick, ModelName: info.ModelName, ModelNo: info.ModelNo, BrandName: info.BrandName, Sku: info.Sku, ItemCategory: info.ItemCategory, Manufacturer: info.Manufacturer, Origin: info.Origin, Language: info.Language, Currency: info.Currency, SalesPrice: info.SalesPrice, DeliveryPrice: info.DeliveryPrice, MinimumQty: info.MinimumQty, UserCredit: info.UserCredit, Options: nil, Images: info.Images, ShortDesc: info.ShortDesc, OriginDesc: info.OriginDesc, TextDesc: info.TextDesc, Cats: info.Cats, Suggest: info.Suggest, HashUrl: info.HashUrl, HashContent: info.HashContent, } for i, v := range info.Options { ret.Options = append(ret.Options, locals.Option{ Name: v.Name, }) ret.Options[i].Choices = make([]locals.Choice, 0) for _, choice := range v.Choices { ret.Options[i].Choices = append(ret.Options[i].Choices, locals.Choice{ Name: choice.Name, Price: choice.Price, }) } } return ret } type ItemUrlScrapReq struct { ItemUrl string Regexs string ConfigData string Config model.MatchingConfig Reindex bool } func ItemUrlScrap(c echo.Context) error { v := c.Get("receiver").(ItemUrlScrapReq) // body, err := util.Get(v.ItemUrl) // if err != nil { // return err // } itemInfo, err := parse(v, 0) if err != nil { return c.String(604, "ertvwerawqfd-ItemUrl Parse failed: "+err.Error()) } ret, _ := json.MarshalIndent(itemInfo, "", "\t") // fmt.Println(string(data)) // ret, _ := json.Marshal(itemInfo) return c.JSONBlob(http.StatusOK, ret) } var regexpTitle *regexp.Regexp func init() { regexpTitle, _ = regexp.Compile("(.*)") } func getTitle(body string) string { ss := regexpTitle.FindAllStringSubmatch(body, 1) if len(ss) == 1 { return ss[0][1] } return "" } func parse(config ItemUrlScrapReq, interval int) (ret model.ItemInfo, err error) { conf := model.MatchingConfig{} json.Unmarshal([]byte(config.ConfigData), &conf) config.Config = conf body, _ := util.Get(config.ItemUrl) t, _, err := getSolutionType(body) if err != nil { return } ret = parsePageData(t, config, interval) return } func parseSolution(t model.SolutionType, theme, uri, body string) (ret model.ItemInfo) { ret.SolutionName = t u, err := url.Parse(uri) if err != nil { return } ret.DomainName = u.Host ret.DomainURI = uri if t == model.SolutionTypeWooCommerce { wordpress.Parse(body, &ret) } else if t == model.SolutionTypeDabory { dabory.Parse(body, &ret) } else if t == model.SolutionTypeShopify { shopify.Parse(body, &ret) } else if t == model.SolutionTypeMagento { magento.Parse(body, &ret) } else if t == model.SolutionTypeCafe24 { cafe24.Parse(body, &ret) } else if t == model.SolutionTypeGodo { godo.Parse(body, &ret) } else if t == model.SolutionTypeYoung { young.Parse(body, &ret) } else if t == model.SolutionTypeOthers { magento.Parse(body, &ret) } return } func parsePageData(t model.SolutionType, config ItemUrlScrapReq, interval int) (ret model.ItemInfo) { rawUrl, _ := gurl.RawDecode(config.ItemUrl) u, _ := url.Parse(rawUrl) hash := md5.New() ret = parserData.Parse(rawUrl, config.Config) println(fmt.Sprintf("%v", ret.Images)) ret.DomainName = u.Host ret.SolutionName = t if ret.SolutionName == "" { ret.SolutionName = "NON" } ret.DomainURI = rawUrl ret.Cats = ret.ItemCategory hashContent, _ := json.Marshal(ret) hash.Write(hashContent) // if ret.Images == nil || len(ret.Images) == 0 { // return // } ret.HashContent = hex.EncodeToString((hash.Sum(nil))) hash.Reset() hash.Write([]byte(rawUrl)) ret.HashUrl = hex.EncodeToString((hash.Sum([]byte(nil)))) hash.Reset() ret.Suggest = strings.Split(ret.ItemName, " ") // time.Sleep(1 * time.Second) return } func getSolutionType(body string) (t model.SolutionType, theme string, reterr error) { if strings.Contains(body, "window.CAFE24") { t = model.SolutionTypeCafe24 } else if strings.Contains(body, "dbrshop") { t = model.SolutionTypeDabory } else if strings.Contains(body, "/wp-content/") { t = model.SolutionTypeWooCommerce } else if strings.Contains(body, "고도몰5") { t = model.SolutionTypeGodo } else if strings.Contains(body, "cdn.shopify.com") { t = model.SolutionTypeShopify } else if strings.Contains(body, "magento") { t = model.SolutionTypeMagento } else if strings.Contains(body, "it_id=") { t = model.SolutionTypeYoung } else { t = model.SolutionTypeOthers //reterr = errors.New("no found solution type") } theme = "" re := regexp.MustCompile(`wp-content\/themes\/(.+?)\/`) result := re.FindStringSubmatch(body) if len(result) > 1 { theme = result[1] } else { theme = "generic" } //ioutil.WriteFile(string(t) + ".html", []byte(body), 644) return }