Crawling | Notion

전체를 돌리면 약 4시간 정도 걸리는 것 같습니다(정확한 시간은 미확인).

크롤링 코드입니다

Chrome 웹드라이버(chromedriver)를 다운받아야 합니다.

conda에서 selenium, beautifulsoup을 추가로 설치(install)해야 합니다.

# Standard library (timing, HTTP request, JSON parsing)
import json
import time
from urllib.request import Request, urlopen

# Third-party (HTML parsing, browser automation)
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# Load the perfume list: fetch the Fragrantica awards nominations
# (category_id=635) and decode the JSON payload into `obj`.
# NOTE: the URL has to be a bare string. The '<...>' wrapper that was around it
# turns the scheme into '<https', which is not a valid URL scheme, so the
# request could not be opened.
req = Request(
    "https://www.fragrantica.com/awards.php?show_mode=json_nominations&category_id=635",
    # The request is sent with a browser-like User-Agent header.
    headers={'User-Agent': 'Mozilla/5.0'},
)
# Close the HTTP response as soon as the body has been read.
with urlopen(req) as response:
    webpage = response.read()
obj = json.loads(webpage)

# Number of perfumes to collect: only the first `collect_num` nominations are kept.
collect_num = 200
nominations = obj['nominations'][:collect_num]
# Extract the 'url' and 'name' fields of every kept nomination, in order.
nominations_url = list(map(lambda entry: entry['url'], nominations))
nominations_name = list(map(lambda entry: entry['name'], nominations))

# Setup: start Chrome through the chromedriver binary at this local path.
# NOTE(review): the driver path is passed positionally, as in the original
# script; newer Selenium releases expect a Service object instead — confirm
# against the installed Selenium version.
chromedriver = '/Users/lunab/Desktop/chromedriver'
driver = webdriver.Chrome(chromedriver)

# Load reviews: one {'name', 'accords', 'reviews'} record per perfume page.
database = []
# Bare site root; the '<...>' wrapper that was around it produced invalid
# page URLs once the nomination path was appended.
base = 'https://www.fragrantica.com'
try:
    driver.implicitly_wait(time_to_wait=5)

    for name, path in zip(nominations_name, nominations_url):
        url = base + path
        driver.get(url)
        time.sleep(1)

        # accord: text of every accord bar on the page
        accords = driver.find_elements(
            By.CSS_SELECTOR,
            '#main-content .cell .grid-x .cell.accord-box .accord-bar',
        )
        accords_list = [bar.text for bar in accords]

        # scroll down (loading): keep scrolling to just above the 'popBrands'
        # element until the document height stops changing between passes.
        last_height = driver.execute_script("return document.body.scrollHeight")

        while True:
            driver.execute_script("window.scrollTo(0, document.getElementById('popBrands').offsetTop - 500)")
            time.sleep(3)

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        review_scroll = driver.find_elements(
            By.CSS_SELECTOR,
            "#all-reviews .cell .grid-x .cell.fragrance-review-box .flex-child-auto div",
        )

        database.append({
            'name': name,
            'accords': accords_list,
            'reviews': [review.text for review in review_scroll],
        })

        # Progress output: number of review elements found on this page.
        print(len(review_scroll))
finally:
    # Always shut the browser down, even if a page fails mid-crawl.
    driver.quit()

# Save: flatten the crawl result to one row per review and write it to CSV.
import pandas as pd

# Each row repeats the perfume's name and accord list next to one review text.
rows = [
    [record['name'], record['accords'], review]
    for record in database
    for review in record['reviews']
]
# NOTE(review): the 'reivew' header looks like a typo of 'review'; it is kept
# unchanged because existing CSV exports or readers may depend on it — confirm
# before renaming.
df = pd.DataFrame(rows, columns=['name', 'accords','reivew'])
df.to_csv('dataset(unisex-best 2020).csv', index=False)

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()
print(df.head())

unisex-best 2020

dataset(unisex-best 2020) (2).csv

(New!) unisex-best 2020 - 20개 모든 리뷰 + accords

2020.05.13 (목) 12:49

dataset(unisex-best 2020).csv

(NewNew!) search 2020 - 50개 모든 리뷰 (name, accords, review)

dataset(unisex-best 2020) (4).csv