feat: add Toutiao-to-Weibo sync module
parent 7b71af88e3
commit 10966b2be9
.idea/PythonTest.iml (generated): 2 changed lines
@@ -4,7 +4,7 @@
     <content url="file://$MODULE_DIR$">
       <excludeFolder url="file://$MODULE_DIR$/venv" />
     </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (PythonTest)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>
.idea/misc.xml (generated): 2 changed lines
@@ -1,4 +1,4 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (PythonTest)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
 </project>
DouBan.py (new file): 162 lines
@@ -0,0 +1,162 @@
import os
import pandas as pd
import random
import time
from bs4 import BeautifulSoup
from selenium import webdriver

# Since October 2017 Douban has blocked bulk scraping: only about 200 short comments
# can be fetched without logging in, and about 500 when logged in.
# Roughly 40 requests per minute are tolerated in the daytime and 60 at night;
# going over the limit gets the IP address banned.
from selenium.webdriver.common.by import By


class CommentsCrawler(object):
    """
    Crawler for Douban comments.
    """
    def __init__(self, subject_url, output_path, username, pwd):
        self.subject_url = subject_url  # URL of the title whose comments are crawled
        self.output_path = output_path  # directory where the results are saved
        self.username = username
        self.pwd = pwd

        if not os.path.exists(output_path):
            os.makedirs(output_path)

        self.all_comments_file = os.path.join(output_path, 'all_comments.csv')
        # Download the chromedriver matching your Chrome version:
        # https://sites.google.com/a/chromium.org/chromedriver/downloads
        # and add its location to the PATH environment variable.
        # options = webdriver.ChromeOptions()
        # options.add_argument("--user-data-dir='C:\\Users\\Administrator\\AppData\\Local\\Google\\Chrome\\User Data\\Default'")
        self.driver = webdriver.Chrome()
        # Read the cookies currently held by the Chrome session
        cookies = self.driver.get_cookies()
        # and attach them to the WebDriver instance
        for cookie in cookies:
            self.driver.add_cookie(cookie)
        self._simulate_login()

    def _simulate_login(self):
        """
        Simulate a login.
        """
        # Log in to Douban through the browser
        login_url = 'https://www.douban.com/'
        self.driver.get(login_url)
        self.driver.switch_to.frame(self.driver.find_elements(By.TAG_NAME, "iframe")[0])

        # Click "password login"
        pwd_login = self.driver.find_element(By.XPATH, '/html/body/div[1]/div[1]/ul[1]/li[2]')
        pwd_login.click()

        # Enter the account name
        username_input = self.driver.find_element(By.XPATH, '//*[@id="username"]')
        username_input.clear()
        username_input.send_keys(self.username)

        # Enter the password
        pwd_input = self.driver.find_element(By.XPATH, '//*[@id="password"]')
        pwd_input.clear()
        pwd_input.send_keys(self.pwd)

        # Submit the login form
        bottom = self.driver.find_element(By.CLASS_NAME, 'account-form-field-submit ')
        bottom.click()

        # Wait 20 seconds after logging in
        time.sleep(20)

    def get_all_comments(self):
        """
        Fetch all comments for the series; at most 500 can be crawled.
        """
        user_id_list = []       # user ids
        user_city_list = []     # users' cities
        rating_list = []        # crawled ratings
        comment_date_list = []  # crawled comment dates
        comment_list = []       # crawled comment texts

        comment_index = 0  # index of the first comment on the current page
        # while comment_index < 20:
        while True:
            # URL of the current page of comments
            all_comm_url = self.subject_url + 'comments?start={}&limit=20&sort=new_score&status=P'.format(comment_index)
            self.driver.get(all_comm_url)
            print('Crawling records on page {}...'.format(int(comment_index / 20 + 1)))
            print(all_comm_url)
            # Page fetched successfully
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            comment_tag_list = soup.find_all(class_='comment')
            if len(comment_tag_list) > 0:
                for comment_tag in comment_tag_list:
                    # User id
                    user_id = comment_tag.find(class_='comment-info').find('a').text.strip()

                    # User's city, read from the user's profile page
                    user_page_url = comment_tag.find(class_='comment-info').find('a').get('href')
                    self.driver.get(user_page_url)
                    user_soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                    try:
                        user_city = user_soup.find(class_='user-info').find('a').text.strip()
                    except Exception as e:
                        print('Failed to read user info (city unavailable):', e)
                        user_city = ''

                    # Rating
                    rating = comment_tag.find(class_='rating').get('title').strip() \
                        if comment_tag.find(class_='rating') is not None else ''
                    # Comment date
                    comment_date = comment_tag.find(class_='comment-time').text.strip() \
                        if comment_tag.find(class_='comment-time') is not None else ''
                    # Comment text
                    comment = comment_tag.find(class_='short').text.strip() \
                        if comment_tag.find(class_='short') is not None else ''

                    user_id_list.append(user_id)
                    user_city_list.append(user_city)
                    rating_list.append(rating)
                    comment_date_list.append(comment_date)
                    comment_list.append(comment)

                comment_index += 20

                # Pause on the current page for a random amount of time
                time.sleep(random.random() * 3)
            else:
                # No comments on this page: stop crawling and save the results
                self._save_to_file(user_id_list, user_city_list, rating_list, comment_date_list, comment_list)
                break
        else:
            # Runs only if the loop exits without a break (e.g. with the bounded loop above): save the results
            self._save_to_file(user_id_list, user_city_list, rating_list, comment_date_list, comment_list)

    def _save_to_file(self, user_id_list, user_city_list, rating_list, comment_date_list, comment_list):
        """
        Save the crawled results.
        """
        results_df = pd.DataFrame()
        results_df['user_id'] = user_id_list
        results_df['city'] = user_city_list
        results_df['rating'] = rating_list
        results_df['date'] = comment_date_list
        results_df['comment'] = comment_list
        results_df.to_csv(self.all_comments_file, encoding='utf_8_sig', index=False)

        print('Crawled {} comment records'.format(len(comment_list)))
        print('Results saved to {}'.format(self.all_comments_file))


if __name__ == '__main__':
    # Douban page of "The Longest Day in Chang'an"
    subject_url = 'https://movie.douban.com/subject/26849758/'
    output_path = './chang_an'

    # # Douban page of "Joy of Life"
    # subject_url = 'https://movie.douban.com/subject/25853071/'
    # output_path = './qingyunian'

    username = '18511587398'
    pwd = 'aihun2082486'
    cc = CommentsCrawler(subject_url, output_path, username, pwd)
    cc.get_all_comments()
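The header comment in DouBan.py caps polite usage at roughly 40 requests per minute in the daytime and 60 at night. Below is a minimal pacing sketch built on those numbers; the day/night boundary and the helper itself are assumptions, not part of the commit, but something like it could stand in for the time.sleep(random.random() * 3) call in get_all_comments:

# Pacing helper sketched from the limits quoted in DouBan.py's header comment:
# about 40 requests/minute in the daytime and 60 at night.
# The 07:00-22:00 daytime window is an assumption.
import datetime
import random
import time


def polite_sleep():
    """Sleep long enough to stay under the per-minute request limit."""
    hour = datetime.datetime.now().hour
    per_minute = 40 if 7 <= hour < 22 else 60  # assumed daytime window
    base = 60.0 / per_minute                   # minimum spacing in seconds
    time.sleep(base + random.random())         # jitter so requests look less regular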
@@ -24,7 +24,7 @@ def traverse_dir(path):
 # Open the config file and read the user-configured settings
 def openConfigJson(path):
     # Read the JSON config to get the names of the tables to extract
-    with open(path, "r") as f:
+    with open(path, "r", encoding="utf-8") as f:
         configMap = json.load(f)
         return configMap

@@ -83,8 +83,8 @@ if __name__ == '__main__':
     tableNameList = list()
     configMap = openConfigJson(jsonPath)
     if configMap["tables"] and len(configMap["tables"]) > 0:
-        for tableName in set(configMap["tables"]):
-            tableNameList.append(tableName)
+        for tableInfo in configMap["tables"]:
+            tableNameList.append(tableInfo["table"])
         print(tableNameList)
     else:
         raise AttributeError("No table names to extract are configured in config.json")
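After this change, each entry under "tables" in config.json is expected to be an object carrying a "table" field rather than a bare table name. A sketch of the assumed layout and of how openConfigJson and the updated loop consume it (the table names here are made up for illustration):

# Sketch of the config.json layout the updated loop assumes
# (the table names "user_info" and "order_detail" are hypothetical).
import json

example_config = {
    "tables": [
        {"table": "user_info"},
        {"table": "order_detail"},
    ]
}

# Write the sample config, then read it back the same way openConfigJson does
with open("config.json", "w", encoding="utf-8") as f:
    json.dump(example_config, f, ensure_ascii=False, indent=2)

with open("config.json", "r", encoding="utf-8") as f:
    configMap = json.load(f)

tableNameList = [tableInfo["table"] for tableInfo in configMap["tables"]]
print(tableNameList)  # ['user_info', 'order_detail']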
chang_an/SyncToutiao2Weibo.py (new file): 67 lines
@@ -0,0 +1,67 @@
import requests
from requests_html import HTMLSession, HTML
from datetime import timedelta
import json
import datetime

# Fetch micro-headline (weitoutiao) posts from the 欧洲足球时评 (European football commentary) account
"""
Fetch the micro-headline data
"""

max_behot_time = '0'
def getWeiToutiaoInfo():
    global max_behot_time
    # Timestamps of the first and last day of the previous week
    lastWeekStartTime, lastWeekEndTime = getLastWeekFirstDayTimeStamp()
    session = HTMLSession()
    weitoutiaoGet = session.get(f"https://www.toutiao.com/api/pc/list/user/feed?category=pc_profile_ugc&token=MS4wLjABAAAA7lHc4sBPuZaQ85qdIrwVvWm8Ps5O1kPMpuh5lTJAwII&max_behot_time={max_behot_time}&aid=24&app_name=toutiao_web", verify=False, proxies=None)
    # print(weitoutiaoGet.text)
    # Parse the returned JSON
    resultJson = json.loads(weitoutiaoGet.text)
    # Grab the next max_behot_time first; it is needed when paging through further results
    max_behot_time = resultJson['next']['max_behot_time']
    # Walk through the data entries to get each micro-headline's content and author
    dataList = resultJson['data']
    # Check each post's publish timestamp: stop once a post older than last Monday 00:00 shows up,
    # otherwise call this function recursively to fetch the next page
    for data in dataList:
        # Publish time of this post
        publishTime = data["publish_time"]
        if int(publishTime) > int(lastWeekEndTime):
            continue
        elif int(publishTime) < int(lastWeekStartTime):
            return
        else:
            # Skip score-prediction posts
            content = str(data['content'])
            print(content)
            # if content.find("比分预测】") > 0:
            #     continue
            # else:
            #     # Get the author and publish date of this micro-headline
            #     author = None
            #     contentSubString = content[-8:]
            #     if contentSubString.find("太能喵") > 0:
            #         weitoutiaoMap["太能喵"] = weitoutiaoMap["太能喵"] + 1
            #     elif contentSubString.find("小小") > 0:
            #         weitoutiaoMap["小小"] = weitoutiaoMap["小小"] + 1
            #     elif contentSubString.find("大帝强") > 0:
            #         weitoutiaoMap["大帝强"] = weitoutiaoMap["大帝强"] + 1
            #     elif contentSubString.find("叶小欢") > 0:
            #         weitoutiaoMap["叶小欢"] = weitoutiaoMap["叶小欢"] + 1
    getWeiToutiaoInfo()


"""
Timestamps of the first and last day of the week before the current date
"""
def getLastWeekFirstDayTimeStamp():
    now = datetime.datetime.now()
    # First and last day of last week
    last_week_start = now - timedelta(days=now.weekday() + 7, hours=now.hour, minutes=now.minute, seconds=now.second, microseconds=now.microsecond)
    last_week_end = last_week_start - timedelta(days=-7)
    print(f"Last week started on {last_week_start} and ended on {last_week_end}")
    print(f"Start-of-week timestamp: {last_week_start.timestamp()}, end-of-week timestamp: {last_week_end.timestamp()}")
    return last_week_start.timestamp(), last_week_end.timestamp()


if __name__ == '__main__':
    getWeiToutiaoInfo()
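getLastWeekFirstDayTimeStamp subtracts the current weekday plus seven days (and the time of day) from now, which pins the window to last Monday 00:00 through this Monday 00:00. A quick check of that window with a fixed date instead of datetime.now() (the sample date is arbitrary):

# Quick check of the last-week window used by getLastWeekFirstDayTimeStamp,
# run against a fixed date rather than datetime.now().
import datetime
from datetime import timedelta

now = datetime.datetime(2019, 7, 3, 15, 30)  # a Wednesday, chosen arbitrarily
last_week_start = now - timedelta(days=now.weekday() + 7,
                                  hours=now.hour, minutes=now.minute,
                                  seconds=now.second, microseconds=now.microsecond)
last_week_end = last_week_start + timedelta(days=7)  # same as subtracting timedelta(days=-7)

print(last_week_start)  # 2019-06-24 00:00:00 -> Monday of the previous week
print(last_week_end)    # 2019-07-01 00:00:00 -> Monday of the current week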
chang_an/all_comments.csv (new file): 23 lines
@@ -0,0 +1,23 @@
user_id,city,rating,date,comment
西年,河北邯郸,推荐,2019-06-27 21:12:26,唐朝只有道士的簪子是竖着插,很少有剧组会注意到这点,所以这部剧的道具是真的用心,下了功夫研究的。服装化妆也都精致。一个最多时有上百个群演的组,每天出妆7小时。
天是红河岸,上海,力荐,2019-06-27 20:39:18,打光摄影服化道都用心,曹盾的画面一如既往靠谱,质感太棒了!应该是本年度最佳古装了。
秃头小宝贝,湖北武汉,力荐,2019-06-27 20:39:17,终于播了,从2017年官宣开始,我就在期待了!没想到雷佳音古装也这么合适,易烊千玺的气质也非常适合李泌。
鸟倦知还,上海,推荐,2019-06-27 21:06:28,《妖猫传》之后在电视剧里面看到这么精致的盛唐太震撼了,能播出真是2019古装剧的华采。以雷佳音的张小敬和李必为中心的群像剧,有悬疑色彩,但更多的是不灭的赤子之心。很有诚意的国产精品作。
欢乐分裂,上海,还行,2019-06-29 18:52:08,还是过誉了,开篇的盛唐气象和长镜穿梭很有感觉,服化道之讲究自不赘言。但作为电视剧核心的剧本,在节奏铺排上仍存有相当纰漏——将十二时辰的物理时间长度填满25集的长度,势必要穿插无数人和事(包括闪回),于是叙事点和各事件的剪接变得非常重要,而至中段看来,旁枝逸出的细节虽有利于完成破案拼图,究其自身不少细节是拖沓的,因此整体有不畅之处,作为悬疑权斗剧来说,摊子铺得过大的话,一来拖节奏后腿,二则也不容易收场自圆其说。选角倒是意外契合,四字表现不错,雷佳音虽可爱但角色同质化太雷同了吧。
呜昂王,"Reykjavík, Iceland",力荐,2019-06-27 21:26:28,一天内拯救西安终于上了,不枉我等了这么久,这个剧太符合我的口味,服装造型场景设计以及台词太吃的下去了,演员选的也好,雷佳音把张小敬演活了每一个眼神动作台词我觉得非常到位!易烊千玺把李必高冷清净端正的感觉演出来了,好多人说弟弟台词不好,确实跟老戏骨比稍显稚嫩,但是也能看到他的努力。这部剧我吹爆每一位演员和幕后制作人员感谢你们还原了长安,西安人听着坊名地名倍感亲切!
掉线,北京,推荐,2019-06-27 22:37:32,摄影太赞了,构图和色调真的是如盛唐画卷一般,无论是细节还是大场面都看得出背后制作的用心和规模。故事时间限定在十二时辰当中,快节奏的剧情让人很有追下去的欲望,第一集便用案件架设了悬念。易烊千玺和雷佳音的表演也都不错,这个剧真的很可!
元气少女楼跟跟,,力荐,2019-06-27 20:52:00,期待已久的剧终于开播了!好喜欢呀!易烊千玺的表现很惊喜!
momo,浙江杭州,推荐,2019-06-27 20:44:34,能在2019看到这部戏简直是人生之幸,雷佳音的张小敬是少见的不想笑的角色,看着一身杀伐气就知道他拼的有多值。拿到好剧本的曹盾宛如开了挂一样,实力派×潜力股,这个配置,绝了。
把噗,北京,推荐,2019-06-27 23:12:26,年度第一良心国产古装剧。开头几段长镜头极尽影像美学,调色和构图很有电影质感。场景、道具、服装……十分考究,还原了一个盛极而衰的大唐长安。雷佳音的表演有意想不到的反差,千玺弟弟真的可以,演技到位,惊喜之上。
Shirleysays,北京,力荐,2019-06-27 23:33:12,看了十集。改编走心,演员专心,舞美用心,古装剧里少见的良心剧。易烊千玺的扛鼎之作,从出现在镜头里的那一刻起,就令人刮目相看了,面对十七岁的自己,他真是太赞了👍。
碎碎念,河南郑州,力荐,2019-06-27 21:45:16,感觉没港台的掺合,大陆班底慢慢走回正轨了!
蓥鄞一米,上海,推荐,2019-06-27 21:10:36,草灰蛇线的几集,节奏稳的一批,很少见这么大部头扎扎实实的古装剧了。雷佳音演的很拼,易烊千玺难得的hold住了,作为四字弟弟第一部影视作品,这个成绩单交的相当漂亮。
理想蔚藍,上海,力荐,2019-06-28 00:03:33,精致!长安反恐24小时!
艮艮,,推荐,2019-06-27 21:08:02,终于有一部期待的剧播出了,太难得了!画面服化道也太棒了,整体节奏UPUP的快,让人恨不得一天看完全集。长安城里,看似平静的世道啊,早就变天了。
Chestnut.,浙江宁波,力荐,2019-06-28 23:09:29,服化道美呆,长安众生相拍的太好了!!!吹爆大头演技!四字竟然表现的还8错。
大浪淘沙,"Nairobi, Kenya",力荐,2019-06-27 21:35:32,很有质感的国产电视剧 制作很惊喜 演员台词和表情都很到位 看过原著 期待后面的剧情
嘴巴嘟嘟,北京,力荐,2019-06-27 21:56:36,终于来了,看了第一集,先马一下,质感真的超级赞,主演演技都很好,原音好评
凹凸,上海,推荐,2019-06-27 23:20:02,"《绣春刀》之后又看雷佳音演古装剧,台词功力太厉害,一开口就是戏。
有被制作惊艳到,光看开场的长安城长镜头就知道出手不凡,之后一段室内打戏的穿墙长镜头更是调度惊人。不仅摄影构图和后期滤镜考究,配乐和服道化也都是电影级的精良制作,现在连网剧都这么有水准了么?
头几集就铺了个大局,先给四星,持续观望下。"
宁弍,,力荐,2019-06-27 23:03:50,台词、布景、细节、服化、收音、原声,包括原本担心的节奏都跟上了,大头和四字演技都在线,客观说四字台词还需打磨,但不妨碍这部剧好看呀。
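For later analysis, the saved file can be read back with the same encoding _save_to_file wrote it with; a minimal sketch for loading chang_an/all_comments.csv into pandas:

# Minimal sketch: load the crawled comments back into a DataFrame
# (utf-8-sig matches the utf_8_sig used when the file was written).
import pandas as pd

df = pd.read_csv('chang_an/all_comments.csv', encoding='utf-8-sig')
print(df.shape)                     # (rows, columns)
print(df['rating'].value_counts())  # distribution of the textual ratings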