// tct-main.go (7.9 KB)
  1. package controllers_scraper
  2. import (
  3. "crypto/md5"
  4. "encoding/hex"
  5. "encoding/json"
  6. "fmt"
  7. "kkscrap-go/controllers/scraper/cafe24"
  8. "kkscrap-go/controllers/scraper/dabory"
  9. "kkscrap-go/controllers/scraper/godo"
  10. "kkscrap-go/controllers/scraper/magento"
  11. parserData "kkscrap-go/controllers/scraper/parser"
  12. "kkscrap-go/controllers/scraper/shopify"
  13. "kkscrap-go/controllers/scraper/util"
  14. "kkscrap-go/controllers/scraper/wordpress"
  15. "kkscrap-go/controllers/scraper/young"
  16. "kkscrap-go/locals"
  17. "kkscrap-go/model"
  18. "net/http"
  19. "net/url"
  20. "regexp"
  21. "strings"
  22. e "github.com/dabory/abango-rest/etc"
  23. "github.com/gogf/gf/v2/encoding/gurl"
  24. // "golang.org/x/crypto/bcrypt"
  25. "github.com/labstack/echo"
  26. )
  27. type SolutionTypeGetReq struct {
  28. Url string
  29. }
  30. func SolutionTypeGet(c echo.Context) error {
  31. v := c.Get("receiver").(SolutionTypeGetReq)
  32. retv := &struct {
  33. SolutionType string
  34. ThemeType string
  35. }{}
  36. body, err := util.Get(v.Url)
  37. if err != nil {
  38. return c.JSON(404, err.Error())
  39. }
  40. // fmt.Println(v.Url)
  41. st, theme, err := getSolutionType(body)
  42. if err != nil {
  43. return c.JSONBlob(http.StatusOK, []byte(err.Error()))
  44. }
  45. retv.SolutionType = string(st)
  46. retv.ThemeType = theme
  47. return c.JSON(http.StatusOK, retv)
  48. }
  49. type ProductPageGetReq struct {
  50. SolutionType model.SolutionType
  51. ThemeType string
  52. Products []ProductUri
  53. Regexs string
  54. ConfigData string
  55. Config model.MatchingConfig
  56. Reindex bool
  57. }
  58. type ProductUri struct {
  59. Uri string
  60. }
  61. // 오리지널 웹사이트 전체를 업테이트 하는 경우 Uri를 하나씩 보내면 비효율적이므로 하나의 배치로
  62. // 묶어서 요청할 수 있도록 한다. 주로 product-page-get를 쓰고 item-url-scrap은 deprecate 예정임.
  63. func ProductPageGet(c echo.Context) error {
  64. v := c.Get("receiver").(ProductPageGetReq)
  65. // conf := model.MatchingConfig{}
  66. json.Unmarshal([]byte(v.ConfigData), &v.Config)
  67. var vRet locals.ProductPage // Row(개별레코드)->Page(Row의 집합)->Book(Page의 집합)의 개념
  68. for _, p := range v.Products {
  69. body, err := util.Get(p.Uri)
  70. if err != nil {
  71. e.ErrLog(e.FuncRun("03uoaiuor0", e.CurrFuncName()), err)
  72. }
  73. // 전체 웹사이트가 아니라 개별 상품페이지(1개페이지)의 경우 SolutionType 없이 request됨
  74. if v.SolutionType == "" {
  75. var err error
  76. v.SolutionType, v.ThemeType, err = getSolutionType(body)
  77. if err != nil {
  78. return c.String(705, "Solution Type Not Found:"+err.Error())
  79. }
  80. }
  81. // prodInfo := parseSolution(v.SolutionType, v.ThemeType, p.Uri, body)
  82. prodInfo := parsePageData(v.SolutionType, ItemUrlScrapReq{
  83. ItemUrl: p.Uri, Config: v.Config, Reindex: false,
  84. }, 0)
  85. vRet.ProductPage = append(vRet.ProductPage, toProductPage(prodInfo))
  86. }
  87. // ret, _ := json.MarshalIndent(itemInfo, "", "\t")
  88. ret, _ := json.Marshal(vRet)
  89. return c.JSONBlob(http.StatusOK, ret)
  90. }
  91. func toProductPage(info model.ItemInfo) locals.Product {
  92. ret := locals.Product{
  93. SolutionName: locals.SolutionType(info.SolutionName),
  94. Version: info.Version,
  95. Emails: info.Emails,
  96. DomainName: info.DomainName,
  97. DomainURI: info.DomainURI,
  98. ItemName: info.ItemName,
  99. ItemNick: info.ItemNick,
  100. ModelName: info.ModelName,
  101. ModelNo: info.ModelNo,
  102. BrandName: info.BrandName,
  103. Sku: info.Sku,
  104. ItemCategory: info.ItemCategory,
  105. Manufacturer: info.Manufacturer,
  106. Origin: info.Origin,
  107. Language: info.Language,
  108. Currency: info.Currency,
  109. SalesPrice: info.SalesPrice,
  110. DeliveryPrice: info.DeliveryPrice,
  111. MinimumQty: info.MinimumQty,
  112. UserCredit: info.UserCredit,
  113. Options: nil,
  114. Images: info.Images,
  115. ShortDesc: info.ShortDesc,
  116. OriginDesc: info.OriginDesc,
  117. TextDesc: info.TextDesc,
  118. Cats: info.Cats,
  119. Suggest: info.Suggest,
  120. HashUrl: info.HashUrl,
  121. HashContent: info.HashContent,
  122. }
  123. for i, v := range info.Options {
  124. ret.Options = append(ret.Options, locals.Option{
  125. Name: v.Name,
  126. })
  127. ret.Options[i].Choices = make([]locals.Choice, 0)
  128. for _, choice := range v.Choices {
  129. ret.Options[i].Choices = append(ret.Options[i].Choices, locals.Choice{
  130. Name: choice.Name,
  131. Price: choice.Price,
  132. })
  133. }
  134. }
  135. return ret
  136. }
  137. type ItemUrlScrapReq struct {
  138. ItemUrl string
  139. Regexs string
  140. ConfigData string
  141. Config model.MatchingConfig
  142. Reindex bool
  143. }
  144. func ItemUrlScrap(c echo.Context) error {
  145. v := c.Get("receiver").(ItemUrlScrapReq)
  146. // body, err := util.Get(v.ItemUrl)
  147. // if err != nil {
  148. // return err
  149. // }
  150. itemInfo, err := parse(v, 0)
  151. if err != nil {
  152. return c.String(604, "ertvwerawqfd-ItemUrl Parse failed: "+err.Error())
  153. }
  154. ret, _ := json.MarshalIndent(itemInfo, "", "\t")
  155. // fmt.Println(string(data))
  156. // ret, _ := json.Marshal(itemInfo)
  157. return c.JSONBlob(http.StatusOK, ret)
  158. }
  159. var regexpTitle *regexp.Regexp
  160. func init() {
  161. regexpTitle, _ = regexp.Compile("<title>(.*)</title>")
  162. }
  163. func getTitle(body string) string {
  164. ss := regexpTitle.FindAllStringSubmatch(body, 1)
  165. if len(ss) == 1 {
  166. return ss[0][1]
  167. }
  168. return ""
  169. }
  170. func parse(config ItemUrlScrapReq, interval int) (ret model.ItemInfo, err error) {
  171. conf := model.MatchingConfig{}
  172. json.Unmarshal([]byte(config.ConfigData), &conf)
  173. config.Config = conf
  174. body, _ := util.Get(config.ItemUrl)
  175. t, _, err := getSolutionType(body)
  176. if err != nil {
  177. return
  178. }
  179. ret = parsePageData(t, config, interval)
  180. return
  181. }
  182. func parseSolution(t model.SolutionType, theme, uri, body string) (ret model.ItemInfo) {
  183. ret.SolutionName = t
  184. u, err := url.Parse(uri)
  185. if err != nil {
  186. return
  187. }
  188. ret.DomainName = u.Host
  189. ret.DomainURI = uri
  190. if t == model.SolutionTypeWooCommerce {
  191. wordpress.Parse(body, &ret)
  192. } else if t == model.SolutionTypeDabory {
  193. dabory.Parse(body, &ret)
  194. } else if t == model.SolutionTypeShopify {
  195. shopify.Parse(body, &ret)
  196. } else if t == model.SolutionTypeMagento {
  197. magento.Parse(body, &ret)
  198. } else if t == model.SolutionTypeCafe24 {
  199. cafe24.Parse(body, &ret)
  200. } else if t == model.SolutionTypeGodo {
  201. godo.Parse(body, &ret)
  202. } else if t == model.SolutionTypeYoung {
  203. young.Parse(body, &ret)
  204. } else if t == model.SolutionTypeOthers {
  205. magento.Parse(body, &ret)
  206. }
  207. return
  208. }
  209. func parsePageData(t model.SolutionType, config ItemUrlScrapReq, interval int) (ret model.ItemInfo) {
  210. rawUrl, _ := gurl.RawDecode(config.ItemUrl)
  211. u, _ := url.Parse(rawUrl)
  212. hash := md5.New()
  213. ret = parserData.Parse(rawUrl, config.Config)
  214. println(fmt.Sprintf("%v", ret.Images))
  215. ret.DomainName = u.Host
  216. ret.SolutionName = t
  217. if ret.SolutionName == "" {
  218. ret.SolutionName = "NON"
  219. }
  220. ret.DomainURI = rawUrl
  221. ret.Cats = ret.ItemCategory
  222. hashContent, _ := json.Marshal(ret)
  223. hash.Write(hashContent)
  224. // if ret.Images == nil || len(ret.Images) == 0 {
  225. // return
  226. // }
  227. ret.HashContent = hex.EncodeToString((hash.Sum(nil)))
  228. hash.Reset()
  229. hash.Write([]byte(rawUrl))
  230. ret.HashUrl = hex.EncodeToString((hash.Sum([]byte(nil))))
  231. hash.Reset()
  232. ret.Suggest = strings.Split(ret.ItemName, " ")
  233. // time.Sleep(1 * time.Second)
  234. return
  235. }
  236. func getSolutionType(body string) (t model.SolutionType, theme string, reterr error) {
  237. if strings.Contains(body, "window.CAFE24") {
  238. t = model.SolutionTypeCafe24
  239. } else if strings.Contains(body, "dbrshop") {
  240. t = model.SolutionTypeDabory
  241. } else if strings.Contains(body, "/wp-content/") {
  242. t = model.SolutionTypeWooCommerce
  243. } else if strings.Contains(body, "고도몰5") {
  244. t = model.SolutionTypeGodo
  245. } else if strings.Contains(body, "cdn.shopify.com") {
  246. t = model.SolutionTypeShopify
  247. } else if strings.Contains(body, "magento") {
  248. t = model.SolutionTypeMagento
  249. } else if strings.Contains(body, "it_id=") {
  250. t = model.SolutionTypeYoung
  251. } else {
  252. t = model.SolutionTypeOthers
  253. //reterr = errors.New("no found solution type")
  254. }
  255. theme = ""
  256. re := regexp.MustCompile(`wp-content\/themes\/(.+?)\/`)
  257. result := re.FindStringSubmatch(body)
  258. if len(result) > 1 {
  259. theme = result[1]
  260. } else {
  261. theme = "generic"
  262. }
  263. //ioutil.WriteFile(string(t) + ".html", []byte(body), 644)
  264. return
  265. }