import os
import random
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# Douban has blocked bulk scraping since October 2017: only about 200 short comments
# can be fetched anonymously, and about 500 when logged in.
# Roughly 40 requests per minute are tolerated during the day and 60 at night;
# exceeding that limit gets the IP address banned.


class CommentsCrawler(object):
    """Douban comments crawler."""

    def __init__(self, subject_url, output_path, username, pwd):
        self.subject_url = subject_url    # URL of the movie/series to crawl
        self.output_path = output_path    # directory where the results are saved
        self.username = username
        self.pwd = pwd

        if not os.path.exists(output_path):
            os.makedirs(output_path)
        self.all_comments_file = os.path.join(output_path, 'all_comments.csv')

        # Download the ChromeDriver matching your Chrome version from
        # https://sites.google.com/a/chromium.org/chromedriver/downloads
        # and add its directory to the PATH environment variable.
        # options = webdriver.ChromeOptions()
        # options.add_argument("--user-data-dir='C:\\Users\\Administrator\\AppData\\Local\\Google\\Chrome\\User Data\\Default'")
        self.driver = webdriver.Chrome()

        # The original cookie round-trip is a no-op on a freshly started driver
        # (get_cookies() returns an empty list before any page is loaded), so the
        # crawler relies on the explicit login below instead.
        # cookies = self.driver.get_cookies()
        # for cookie in cookies:
        #     self.driver.add_cookie(cookie)

        self._simulate_login()

    def _simulate_login(self):
        """Log in to Douban by filling in the login form."""
        login_url = 'https://www.douban.com/'
        self.driver.get(login_url)

        # The login form lives inside the first iframe on the page.
        self.driver.switch_to.frame(self.driver.find_elements(By.TAG_NAME, "iframe")[0])

        # Switch to the "password login" tab.
        pwd_login = self.driver.find_element(By.XPATH, '/html/body/div[1]/div[1]/ul[1]/li[2]')
        pwd_login.click()

        # Enter the account name.
        username_input = self.driver.find_element(By.XPATH, '//*[@id="username"]')
        username_input.clear()
        username_input.send_keys(self.username)

        # Enter the password.
        pwd_input = self.driver.find_element(By.XPATH, '//*[@id="password"]')
        pwd_input.clear()
        pwd_input.send_keys(self.pwd)

        # Submit the login form.
        bottom = self.driver.find_element(By.CLASS_NAME, 'account-form-field-submit')
        bottom.click()

        # Wait 20 seconds for the login (and any captcha) to complete.
        time.sleep(20)

    def get_all_comments(self):
        """Crawl the Douban comments for the whole series; at most ~500 comments are reachable."""
        user_id_list = []        # user ids
        user_city_list = []      # users' cities
        rating_list = []         # ratings
        comment_date_list = []   # comment dates
        comment_list = []        # comment texts

        comment_index = 0        # offset of the first comment on the current page
        # while comment_index < 20:
        while True:
            # URL of the current page of comments.
            all_comm_url = self.subject_url + 'comments?start={}&limit=20&sort=new_score&status=P'.format(comment_index)
            self.driver.get(all_comm_url)
            print('Crawling page {}...'.format(comment_index // 20 + 1))
            print(all_comm_url)

            # Parse the page that was just loaded.
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            comment_tag_list = soup.find_all(class_='comment')
            if len(comment_tag_list) > 0:
                for comment_tag in comment_tag_list:
                    # User id.
                    user_id = comment_tag.find(class_='comment-info').find('a').text.strip()

                    # User's city, taken from the user's profile page.
                    user_page_url = comment_tag.find(class_='comment-info').find('a').get('href')
                    self.driver.get(user_page_url)
                    user_soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                    try:
                        user_city = user_soup.find(class_='user-info').find('a').text.strip()
                    except Exception as e:
                        print('Failed to read the user profile (city unavailable):', e)
                        user_city = ''

                    # Rating (may be absent).
                    rating = comment_tag.find(class_='rating').get('title').strip() \
                        if comment_tag.find(class_='rating') is not None else ''

                    # Comment date (may be absent).
                    comment_date = comment_tag.find(class_='comment-time').text.strip() \
                        if comment_tag.find(class_='comment-time') is not None else ''

                    # Comment text (may be absent).
                    comment = comment_tag.find(class_='short').text.strip() \
                        if comment_tag.find(class_='short') is not None else ''

                    user_id_list.append(user_id)
                    user_city_list.append(user_city)
                    rating_list.append(rating)
                    comment_date_list.append(comment_date)
                    comment_list.append(comment)

                comment_index += 20
                # Pause for a random interval before requesting the next page.
                time.sleep(random.random() * 3)
            else:
                # No comments on this page: stop crawling and save the results.
                self._save_to_file(user_id_list, user_city_list, rating_list, comment_date_list, comment_list)
                break
        else:
            # Only reached if the while condition itself becomes false (e.g. with the
            # bounded `while comment_index < 20:` variant above); with `while True:`
            # the loop always exits via the break above.
            self._save_to_file(user_id_list, user_city_list, rating_list, comment_date_list, comment_list)

    def _save_to_file(self, user_id_list, user_city_list, rating_list, comment_date_list, comment_list):
        """Save the crawled results to a CSV file."""
        results_df = pd.DataFrame()
        results_df['user_id'] = user_id_list
        results_df['city'] = user_city_list
        results_df['rating'] = rating_list
        results_df['date'] = comment_date_list
        results_df['comment'] = comment_list
        results_df.to_csv(self.all_comments_file, encoding='utf_8_sig', index=False)
        print('Crawled {} comment records'.format(len(comment_list)))
        print('Results saved to {}'.format(self.all_comments_file))


if __name__ == '__main__':
    # Douban page of "The Longest Day in Chang'an" (长安十二时辰)
    subject_url = 'https://movie.douban.com/subject/26849758/'
    output_path = './chang_an'

    # # Douban page of "Joy of Life" (庆余年)
    # subject_url = 'https://movie.douban.com/subject/25853071/'
    # output_path = './qingyunian'

    # Fill in your own Douban account; the values below are placeholders.
    username = 'your_douban_username'
    pwd = 'your_douban_password'

    cc = CommentsCrawler(subject_url, output_path, username, pwd)
    cc.get_all_comments()
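
# A minimal sketch of how the saved CSV could be inspected afterwards, assuming the
# crawl above completed and wrote ./chang_an/all_comments.csv (the path, column names,
# and encoding follow _save_to_file; adjust them if you change the output settings):
#
#     import pandas as pd
#     df = pd.read_csv('./chang_an/all_comments.csv', encoding='utf_8_sig')
#     print(df['rating'].value_counts())      # distribution of the textual rating labels
#     print(df[['date', 'comment']].head())   # spot-check dates and comment texts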