package main import ( "fmt" "log" "net/http" "regexp" "strings" "github.com/PuerkitoBio/goquery" ) func main() { // Wordpress // url := "https://addand.kr/shop/new-%ed%95%9c-%ea%b6%8c%ec%9c%bc%eb%a1%9c-%eb%81%9d%eb%82%98%eb%8a%94-%eb%85%b8%ec%85%98/" url := "https://seoulknit.com/shop/collar-half-sleeved-top_blue/" // Send an HTTP GET request to the URL response, err := http.Get(url) if err != nil { log.Fatal(err) } defer response.Body.Close() doc, err := goquery.NewDocumentFromReader(response.Body) if err != nil { log.Fatal(err) } // These will the value of PDP parsing structures cItemName := ".product_title" cShortDesc := ".woocommerce-product-details__short-description" cItemCategory := ".posted_in" cItemTags := ".tagged_as" cItemImages := ".woocommerce-product-gallery__image img" cItemTextDesc := "div.woocommerce-tabs" // cItemOptions := "select#rating option" cItemPrice := ".summary>.price .woocommerce-Price-amount.amount" cEmail := "footer" cRating := ".star-rating .rating" cAuthor := ".product-brand a" cVideo := "iframe" // Use the Find method to select elements that match the css selector // doc.Find(cItemName).Each(func(index int, element *goquery.Selection) { // // Extract the text associated with the selected element // text := element.Text() // fmt.Printf("Text associated with %s: %s\n", cItemName, text) // }) fmt.Println("ItemName: ", doc.Find(cItemName).First().Text()) fmt.Println("ShortDesc: ", doc.Find(cShortDesc).First().Text()) fmt.Println("ItemCategory: ", doc.Find(cItemCategory).First().Text()) fmt.Println("ItemTags: ", doc.Find(cItemTags).First().Text()) fmt.Println("TextDesc: ", doc.Find(cItemTextDesc).First().Text()) fmt.Println("ItemPice: ", doc.Find(cItemPrice).First().Text()) fmt.Println("Email: ", findEmail(doc.Find(cEmail).First().Text(), "")) fmt.Println("Rating : ", doc.Find(cRating).First().Text()) doc.Find("link ~ meta").Each(func(i int, s *goquery.Selection) { v, _ := s.Attr("property") fmt.Println(v, " : ", s.AttrOr("content", "")) }) fmt.Println("Author : ") doc.Find(cAuthor).Each(func(index int, element *goquery.Selection) { link, _ := element.Attr("href") fmt.Println("====================") fmt.Println("AuthorName: ", element.Text()) fmt.Printf("AuthorLink: %s\n", link) }) vdos := []string{} doc.Find(cVideo).Each(func(index int, element *goquery.Selection) { embed, _ := element.Attr("src") vdos = append(vdos, embed) }) fmt.Printf("ItemVideos: %s\n", vdos) imgs := []string{} fmt.Println("ItemImages: ") doc.Find(cItemImages).Each(func(index int, element *goquery.Selection) { pattern := `(_[0-9]+x[0-9]+|-[0-9]+x[0-9]+)$` img, _ := element.Attr("src") imgChop, lefter := ChopPath(img) im := strings.Split(imgChop, ".") imgFile := im[0] imgExt := im[1] // fmt.Println("imgFile", imgFile) // fmt.Println("imgExt", imgExt) // Compile the regular expression re := regexp.MustCompile(pattern) // Find the match at the end of the string matches := re.FindStringSubmatch(imgFile) if len(matches) > 1 { substring := matches[1] imgFile = strings.Replace(imgFile, substring, "", -1) } imgs = append(imgs, img[0:lefter+1]+imgFile+"."+imgExt) fmt.Println("", img[0:lefter+1]+imgFile+"."+imgExt) fmt.Println("", img) }) } func findEmail(body string, doms string) (emails []string) { r, _ := regexp.Compile(`[a-zA-Z0-9_+&*-]+(?:\.[a-zA-Z0-9_+&*-]+)*@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,24}`) emails = append(emails, r.FindStringSubmatch(body)...) return } // return the source filename after the last slash func ChopPath(original string) (string, int) { i := strings.LastIndex(original, "/") if i == -1 { return original, i } else { return original[i+1:], i } }