import os
import random
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# Since October 2017 Douban has blocked bulk crawling: an anonymous session can
# fetch at most 200 short comments, and even a logged-in session at most 500.
# Requests are capped at roughly 40 per minute during the day and 60 at night;
# exceeding the cap gets the IP address banned.

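
# A minimal throttling sketch, assuming the request caps noted above; the helper
# name and the sleep bounds are illustrative choices, not part of the original
# script. Sleeping 1.5-3 seconds per request averages ~27 requests per minute,
# safely under the daytime cap of 40.
def polite_sleep(min_s=1.5, max_s=3.0):
    """Sleep a random interval to stay below Douban's per-minute request cap."""
    time.sleep(min_s + random.random() * (max_s - min_s))
# e.g. polite_sleep() could be called between page fetches in get_all_comments.

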
class CommentsCrawler(object):
    """
    Crawler for Douban comments.
    """
    def __init__(self, subject_url, output_path, username, pwd):
        self.subject_url = subject_url  # URL of the movie/show to crawl
        self.output_path = output_path  # directory where results are saved
        self.username = username
        self.pwd = pwd

        if not os.path.exists(output_path):
            os.makedirs(output_path)

        self.all_comments_file = os.path.join(output_path, 'all_comments.csv')
        # Download the chromedriver matching your Chrome version from
        # https://sites.google.com/a/chromium.org/chromedriver/downloads
        # and add its location to the PATH environment variable.
        # options = webdriver.ChromeOptions()
        # options.add_argument("--user-data-dir='C:\\Users\\Administrator\\AppData\\Local\\Google\\Chrome\\User Data\\Default'")
        self.driver = webdriver.Chrome()
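        # Optional variant (a sketch, not from the original script): run Chrome
        # headless so the crawl needs no visible browser window. The flags are
        # standard Chromium switches:
        #   options = webdriver.ChromeOptions()
        #   options.add_argument('--headless')
        #   options.add_argument('--window-size=1920,1080')
        #   self.driver = webdriver.Chrome(options=options)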
        self._simulate_login()

    def _simulate_login(self):
        """
        Log in to Douban by driving the login form in the browser.
        """
        login_url = 'https://www.douban.com/'
        self.driver.get(login_url)
        # The login form is rendered inside an iframe; switch into it first
        self.driver.switch_to.frame(self.driver.find_elements(By.TAG_NAME, "iframe")[0])

        # Click "password login"
        pwd_login = self.driver.find_element(By.XPATH, '/html/body/div[1]/div[1]/ul[1]/li[2]')
        pwd_login.click()

        # Enter the account name
        username_input = self.driver.find_element(By.XPATH, '//*[@id="username"]')
        username_input.clear()
        username_input.send_keys(self.username)

        # Enter the password
        pwd_input = self.driver.find_element(By.XPATH, '//*[@id="password"]')
        pwd_input.clear()
        pwd_input.send_keys(self.pwd)

        # Submit the login form
        login_button = self.driver.find_element(By.CLASS_NAME, 'account-form-field-submit')
        login_button.click()

        # Wait 20 seconds for the login to complete
        time.sleep(20)
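
        # A hedged alternative to the fixed sleep (a sketch; the class name of
        # the logged-in account element is an assumption, not verified against
        # Douban's current markup):
        #   from selenium.webdriver.support.ui import WebDriverWait
        #   from selenium.webdriver.support import expected_conditions as EC
        #   WebDriverWait(self.driver, 30).until(
        #       EC.presence_of_element_located((By.CLASS_NAME, 'nav-user-account')))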

    def get_all_comments(self):
        """
        Fetch the show's Douban comments; at most 500 (25 pages of 20) can be
        crawled, per the site limits noted above.
        """
        user_id_list = []       # user ids
        user_city_list = []     # users' home cities
        rating_list = []        # ratings
        comment_date_list = []  # comment dates
        comment_list = []       # comment texts

        comment_index = 0  # offset of the first comment on the current page
        while True:
            # URL of the current page of comments; 'start' advances by 20 per
            # page, and status=P restricts the list to viewers who marked the
            # title as watched
            all_comm_url = self.subject_url + 'comments?start={}&limit=20&sort=new_score&status=P'.format(comment_index)
            self.driver.get(all_comm_url)
            print('Crawling page {}...'.format(int(comment_index / 20 + 1)))
            print(all_comm_url)
            # Parse the fetched page
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            comment_tag_list = soup.find_all(class_='comment')
            if len(comment_tag_list) > 0:
                for comment_tag in comment_tag_list:
                    # user id
                    user_id = comment_tag.find(class_='comment-info').find('a').text.strip()

                    # user's city: follow the link to the user's profile page
                    user_page_url = comment_tag.find(class_='comment-info').find('a').get('href')
                    self.driver.get(user_page_url)
                    user_soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                    try:
                        user_city = user_soup.find(class_='user-info').find('a').text.strip()
                    except Exception as e:
                        print('Could not read user info (city unavailable):', e)
                        user_city = ''

                    # rating
                    rating = comment_tag.find(class_='rating').get('title').strip() \
                        if comment_tag.find(class_='rating') is not None else ''
                    # comment date
                    comment_date = comment_tag.find(class_='comment-time').text.strip() \
                        if comment_tag.find(class_='comment-time') is not None else ''
                    # comment text
                    comment = comment_tag.find(class_='short').text.strip() \
                        if comment_tag.find(class_='short') is not None else ''

                    user_id_list.append(user_id)
                    user_city_list.append(user_city)
                    rating_list.append(rating)
                    comment_date_list.append(comment_date)
                    comment_list.append(comment)

                comment_index += 20

                # Pause for a random interval before fetching the next page
                time.sleep(random.random() * 3)
            else:
                # No comments on this page: stop crawling and save the results
                self._save_to_file(user_id_list, user_city_list, rating_list, comment_date_list, comment_list)
                break
    def _save_to_file(self, user_id_list, user_city_list, rating_list, comment_date_list, comment_list):
        """
        Save the crawled results to a CSV file.
        """
        results_df = pd.DataFrame()
        results_df['user_id'] = user_id_list
        results_df['city'] = user_city_list
        results_df['rating'] = rating_list
        results_df['date'] = comment_date_list
        results_df['comment'] = comment_list
        # utf_8_sig writes a BOM so the Chinese text opens correctly in Excel
        results_df.to_csv(self.all_comments_file, encoding='utf_8_sig', index=False)

        print('{} comment records crawled'.format(len(comment_list)))
        print('Results saved to {}'.format(self.all_comments_file))
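        # Usage sketch: the saved file can be loaded back later with
        #   df = pd.read_csv(self.all_comments_file, encoding='utf_8_sig')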


if __name__ == '__main__':
    # Douban page for "The Longest Day in Chang'an"
    subject_url = 'https://movie.douban.com/subject/26849758/'
    output_path = './chang_an'

    # # Douban page for "Joy of Life"
    # subject_url = 'https://movie.douban.com/subject/25853071/'
    # output_path = './qingyunian'

    username = '18511587398'
    pwd = 'aihun2082486'
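    # A safer variant (a sketch; the environment-variable names are hypothetical)
    # keeps real credentials out of the source file:
    #   username = os.environ['DOUBAN_USERNAME']
    #   pwd = os.environ['DOUBAN_PASSWORD']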
    cc = CommentsCrawler(subject_url, output_path, username, pwd)
    cc.get_all_comments()