Data/Crawling

8. 쿠팡 상품 크롤링

코딩마루 2024. 4. 7. 13:48

1. 쿠팡 상품 크롤링

쿠팡에서 원하는 검색어로 검색한 뒤, 검색 결과 상위 100위까지의 상품을 크롤링하는 코드다. 크롤링 결과는 엑셀 파일에 저장한다.

import openpyxl
import requests
from bs4 import BeautifulSoup

def select_by_css_selector(soup, css_selector: str):
    """Return the stripped text of the first element matching a CSS selector.

    Args:
        soup: Object exposing ``select_one`` (a parsed BeautifulSoup document
            in this script).
        css_selector: CSS selector passed to ``soup.select_one``.

    Returns:
        The matched element's text with surrounding whitespace removed, or
        the literal string ``"None"`` when an ``AttributeError`` is raised
        along the way (for example when ``select_one`` yields ``None``
        because nothing matched).
    """
    try:
        element = soup.select_one(css_selector)
        text = element.get_text()
        return text.strip()
    except AttributeError:
        # Callers write this placeholder straight into the spreadsheet.
        return "None"

def _crawl_keyword(work_sheet, keyword, headers, max_rank=100, timeout=10):
    """Append up to ``max_rank`` Coupang search results for one keyword.

    Walks the search result pages for ``keyword`` in order, opens every
    product link found on each page, and appends one row per product
    (rank, brand, title, price, URL) to ``work_sheet``.

    Args:
        work_sheet: Worksheet object exposing ``append``.
        keyword: Search term inserted into the search URL.
        headers: HTTP headers sent with every request.
        max_rank: Stop after this many products have been recorded.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of hanging forever.
    """
    prefix_url = "https://www.coupang.com"
    rank = 1
    page = 1

    while rank <= max_rank:
        search_url = "https://www.coupang.com/np/search?component=&q={}&page={}&channel=user".format(keyword, page)
        response = requests.get(search_url, headers=headers, timeout=timeout)
        search_soup = BeautifulSoup(response.text, "html.parser")

        a_tags = search_soup.select("a.search-product-link")
        if not a_tags:
            # No product links on this page: either the results are exhausted
            # or the request was rejected. Stop instead of looping forever.
            break

        for a_tag in a_tags:
            product_url = prefix_url + a_tag.attrs["href"]

            product_response = requests.get(product_url, headers=headers, timeout=timeout)
            product_soup = BeautifulSoup(product_response.text, "html.parser")

            brand_name = select_by_css_selector(product_soup, ".prod-brand-name")
            title = select_by_css_selector(product_soup, "h2.prod-buy-header__title")
            current_price = select_by_css_selector(product_soup, "span.total-price > strong")

            work_sheet.append([rank, brand_name, title, current_price, product_url])
            print("[{}] {}. 브랜드: {} / 제목: {} / 가격: {}".format(keyword, rank, brand_name, title, current_price))
            rank += 1

            if rank > max_rank:
                break

        # Advance to the next result page; without this the same first page
        # would be fetched again and its products recorded as duplicates.
        page += 1


if __name__ == "__main__":
    # Write-only mode: the workbook starts without a default sheet, rows are
    # streamed via append(), and save() may be called only once.
    workbook = openpyxl.Workbook(write_only=True)
    keywords = ["마우스", "키보드", "모니터"]

    # Coupang does not respond to requests that lack these headers.
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
    accept_language = "ko-KR,ko;q=0.8,en-US;q=0.5,en;q=0.3"
    headers = {"User-Agent": user_agent, "Accept-Language": accept_language}

    for keyword in keywords:
        work_sheet = workbook.create_sheet(keyword)
        work_sheet.append(['순위', '브랜드명', '상품명', '가격', 'URL'])
        _crawl_keyword(work_sheet, keyword, headers)

    workbook.save("coupang_crowling_results.xlsx")

 


참고자료: 

1. https://www.inflearn.com/course/%ED%8C%8C%EC%9D%B4%EC%8D%AC-%ED%81%AC%EB%A1%A4%EB%A7%81-%EC%8B%A4%EC%A0%84/dashboard

2. https://www.inflearn.com/questions/622674/쿠팡-requests에-오류가-생기네요