feat: 尝试自动统计文章

This commit is contained in:
xiaoyan 2023-09-25 18:08:32 +08:00
parent 5eb218cf65
commit 24b6d4ca44
3 changed files with 4792 additions and 6 deletions

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -1,4 +1,5 @@
import datetime import datetime
import warnings
from datetime import timedelta from datetime import timedelta
from nonebot.adapters.onebot.v11 import Bot, Event from nonebot.adapters.onebot.v11 import Bot, Event
from nonebot.typing import T_State from nonebot.typing import T_State
@ -8,7 +9,10 @@ from nonebot import on_command
from nonebot.rule import to_me from nonebot.rule import to_me
from requests_html import HTMLSession, HTML from requests_html import HTMLSession, HTML
import json import json
from selenium.webdriver import Chrome, ChromeOptions
driver_path = '../../../../driver/cromedriver_win32'
max_behot_time = '0' max_behot_time = '0'
# 初始化map对象分别记录每个人的微头条个数 # 初始化map对象分别记录每个人的微头条个数
weitoutiaoMap = {"太能喵": 0, "小小": 0, "大帝强": 0, "叶小欢": 0} weitoutiaoMap = {"太能喵": 0, "小小": 0, "大帝强": 0, "叶小欢": 0}
@ -45,16 +49,74 @@ def getWeiToutiaoInfo():
else: else:
# 获取该微头条的发布人和发布日期 # 获取该微头条的发布人和发布日期
author = None author = None
if content.find("太能喵")>0: contentSubString = content[-8]
if contentSubString.find("太能喵")>0:
weitoutiaoMap["太能喵"] = weitoutiaoMap["太能喵"]+1 weitoutiaoMap["太能喵"] = weitoutiaoMap["太能喵"]+1
elif content.find("小小")>0: elif contentSubString.find("小小")>0:
weitoutiaoMap["小小"] = weitoutiaoMap["小小"]+1 weitoutiaoMap["小小"] = weitoutiaoMap["小小"]+1
elif content.find("大帝强") > 0: elif contentSubString.find("大帝强") > 0:
weitoutiaoMap["大帝强"] = weitoutiaoMap["大帝强"] + 1 weitoutiaoMap["大帝强"] = weitoutiaoMap["大帝强"] + 1
elif content.find("叶小欢") > 0: elif contentSubString.find("叶小欢") > 0:
weitoutiaoMap["叶小欢"] = weitoutiaoMap["叶小欢"] + 1 weitoutiaoMap["叶小欢"] = weitoutiaoMap["叶小欢"] + 1
getWeiToutiaoInfo() getWeiToutiaoInfo()
def getWenzhangInfo():
    """Walk the account's article feed and load every article published last week.

    Pages through the Toutiao ``pc_profile_article`` feed, starting from the
    module-level ``max_behot_time`` cursor, and opens each article whose
    ``publish_time`` falls inside last week in a headless Chrome instance.
    The rendered page source is printed; per-author counting is not
    implemented yet (see the TODO below).

    Side effects:
        * Updates the global ``max_behot_time`` with the cursor returned by
          each feed page.
        * Starts one headless Chrome process and always quits it on exit.

    Returns:
        None. Stops at the first item older than last week's start, or when
        the feed returns an empty page.
    """
    global max_behot_time
    # Silence ResourceWarning noise before starting the browser.
    warnings.simplefilter('ignore', ResourceWarning)

    chrome_option = ChromeOptions()
    chrome_option.add_argument('--no-sandbox')  # avoids the "DevToolsActivePort file doesn't exist" error
    chrome_option.add_argument('--disable-gpu')  # recommended by the Chrome docs to dodge a headless bug
    chrome_option.add_argument('--hide-scrollbars')  # hide scrollbars for some unusual pages
    # The command-line flag alone enables headless mode; the separate
    # `headless` attribute assignment was redundant with it.
    chrome_option.add_argument('--headless')

    # Timestamps of the first and last day of the previous week.
    lastWeekStartTime, lastWeekEndTime = getLastWeekFirstDayTimeStamp()
    weekStart = int(lastWeekStartTime)
    weekEnd = int(lastWeekEndTime)

    session = HTMLSession()
    # NOTE(review): newer Selenium releases dropped `executable_path` in
    # favour of a Service object - confirm against the installed version.
    browser = Chrome(executable_path=driver_path, options=chrome_option)
    try:
        # Iterate page by page with a single browser instead of recursing,
        # which would spawn a new Chrome process for every feed page.
        while True:
            # NOTE(review): verify=False disables TLS certificate checks.
            feedResponse = session.get(f"https://www.toutiao.com/api/pc/list/user/feed?category=pc_profile_article&token=MS4wLjABAAAA7lHc4sBPuZaQ85qdIrwVvWm8Ps5O1kPMpuh5lTJAwII&max_behot_time={max_behot_time}&aid=24&app_name=toutiao_web", verify=False, proxies=None)
            resultJson = json.loads(feedResponse.text)
            # Remember the cursor for the next page before processing items.
            max_behot_time = resultJson['next']['max_behot_time']
            dataList = resultJson['data']
            if not dataList:
                # Nothing left to inspect; without this guard an empty page
                # would keep the loop requesting forever.
                return

            for data in dataList:
                publishTime = int(data["publish_time"])
                if publishTime > weekEnd:
                    # Newer than last week: skip and keep scanning.
                    continue
                if publishTime < weekStart:
                    # Older than last week: stop the whole scan here.
                    return

                url = data["url"]
                if url:
                    # WebDriver.get() returns None; the rendered document
                    # has to be read from page_source afterwards.
                    browser.get(url)
                    print(browser.page_source)
                    # TODO: detect the author in the article body and
                    # increment the matching weitoutiaoMap counter.
            # No item was older than last week: continue with the next page.
    finally:
        browser.quit()
        session.close()
""" """
获取当前日期上一周的第一天和最后一天的时间戳 获取当前日期上一周的第一天和最后一天的时间戳
""" """
@ -68,5 +130,5 @@ def getLastWeekFirstDayTimeStamp():
return last_week_start.timestamp(), last_week_end.timestamp() return last_week_start.timestamp(), last_week_end.timestamp()
if __name__ == '__main__': if __name__ == '__main__':
getWeiToutiaoInfo() getWenzhangInfo()
print(weitoutiaoMap) # print(weitoutiaoMap)