# Running this takes about 4 hours..???
# This is the crawling code.
# The Chrome webdriver must be downloaded first.
# selenium and beautifulsoup must additionally be installed via conda.
import json
import time
from urllib.request import Request, urlopen

# Parsing
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
# Load the perfume list: the awards endpoint is requested with a browser-like
# User-Agent header and its body is parsed as JSON.
req = Request(
    "https://www.fragrantica.com/awards.php?show_mode=json_nominations&category_id=635",
    headers={'User-Agent': 'Mozilla/5.0'},
)
# Read inside a context manager so the HTTP response is always closed.
with urlopen(req) as response:
    webpage = response.read()
obj = json.loads(webpage)
# Number of entries to load (the first `collect_num` nominations are kept)
collect_num = 200
nominations = obj['nominations'][:collect_num]
# Parallel lists: the page path and the display name of each nomination.
nominations_url = [entry['url'] for entry in nominations]
nominations_name = [entry['name'] for entry in nominations]
# Setup: start Chrome through the locally downloaded chromedriver binary.
# NOTE(review): newer Selenium releases reportedly expect a Service object
# instead of a driver path here — confirm against the installed version.
chromedriver = '/Users/lunab/Desktop/chromedriver'
driver = webdriver.Chrome(executable_path=chromedriver)
# Element lookups wait up to 5 seconds before giving up.
driver.implicitly_wait(5)
# Load the reviews: visit every nominated perfume page and collect its accords
# and review texts into `database` (one dict per perfume).
database = []
base = 'https://www.fragrantica.com'
try:
    for name, path in zip(nominations_name, nominations_url):
        url = base + path
        driver.get(url)
        time.sleep(1)  # brief pause after navigation before reading the page

        # Accords: text of every accord bar on the page
        accords = driver.find_elements(
            By.CSS_SELECTOR,
            '#main-content .cell .grid-x .cell.accord-box .accord-bar',
        )
        accords_list = [bar.text for bar in accords]

        # Scroll down (loading): keep jumping to just above the #popBrands
        # element until the document height stops changing — presumably each
        # jump makes the page load more reviews.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.getElementById('popBrands').offsetTop - 500)")
            time.sleep(3)  # wait for newly requested content before measuring
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        review_scroll = driver.find_elements(
            By.CSS_SELECTOR,
            "#all-reviews .cell .grid-x .cell.fragrance-review-box .flex-child-auto div",
        )
        database.append({
            'name': name,
            'accords': accords_list,
            'reviews': [review.text for review in review_scroll],
        })
        # Progress output: number of review elements found for this perfume
        print(len(review_scroll))
finally:
    # Always shut the browser down, even if a page fails part-way through.
    driver.quit()
# Save: flatten the crawl results into a table and write it to CSV.
import pandas as pd

# One row per review; the perfume name and its accord list repeat on each row.
rows = [
    [entry['name'], entry['accords'], review]
    for entry in database
    for review in entry['reviews']
]
# NOTE(review): 'reivew' looks like a typo of 'review'. It is kept unchanged
# because it becomes the CSV header — confirm nothing reads the old name
# before renaming it.
df = pd.DataFrame(rows, columns=['name', 'accords', 'reivew'])
df.to_csv('dataset(unisex-best 2020).csv', index=False)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line.
df.info()
print(df.head())
# dataset(unisex-best 2020) (2).csv
# 2020.05.13 (Thu) 12:49