Mirror of https://github.com/Jerryplusy/rc-plugin.git, synced 2025-10-14 16:19:18 +00:00
✨ feat: 1.0.7 add custom dialogue
1. Added advanced feature: custom dialogue. 2. Removed #我靠; it will be reworked in the next release. 3. Tidied up and optimized parts of the code.
This commit is contained in:
parent 140b3de4bd
commit 12127fad05
23974  test/get-pip.py
File diff suppressed because it is too large.
130  test/main.py
@@ -1,130 +0,0 @@
import asyncio
import re
import time

import aiohttp
import motor.motor_tornado
from bs4 import BeautifulSoup

# Base URL of the target site (left empty in the original file)
url = ''

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.35",
    'Pragma': 'no-cache',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    "Content-Type": "text/html;charset=utf-8"}

# Concurrency limit for category-page requests (do not request too many categories at once)
sem_page = asyncio.Semaphore(5)


async def fetch_content(url):
    '''
    Fetch the content behind a URL, retrying on timeouts.
    :param url: URL of the page
    :return: page text
    '''
    async with sem_page:
        print(f'Start fetching: {url}')
        max_retries = 3
        attempt = 0
        while True:
            try:
                async with aiohttp.ClientSession(
                        headers=headers, connector=aiohttp.TCPConnector(ssl=False)
                ) as session:
                    async with session.get(url, timeout=10) as resp:
                        return await resp.text()
            except asyncio.TimeoutError:
                if attempt < max_retries:
                    print(f'Fetch timed out, will retry: {url} attempt: {attempt}')
                    attempt += 1
                else:
                    raise


async def page_pic(connect, pic_page):
    '''
    Handle one picture set and record all of its image URLs.
    :param connect: MongoDB collection handle
    :param pic_page: HTML of one picture-set page
    :return:
    '''
    bs = BeautifulSoup(pic_page, 'lxml')
    div = bs.find('div', {'class': "content"})
    img_temp_link = div.find('img')['src']
    # Drop the trailing "1.jpg" to get the base link of the set
    img_base_link = img_temp_link[0:-5]
    all_text = bs.find('i', id='allbtn').get_text()
    # The button text contains "(current/total)"; take the total
    pattern = re.compile(r"\((.*?)\)")
    total = pattern.search(all_text).group(1).split("/")[1]
    img_urls = []
    for i in range(1, int(total) + 1):
        img_url = img_base_link + str(i) + '.jpg'
        img_urls.append(img_url)
    tasks = [insert_url(connect, url_temp) for url_temp in img_urls]
    await asyncio.gather(*tasks)


async def page_main(url):
    '''
    Download a single listing page.
    :param url: URL of the listing page
    :return: None
    '''
    connect = build_connect()
    main_page_text = await fetch_content(url)
    bs = BeautifulSoup(main_page_text, 'lxml')
    a_all = bs.find_all('a', {'class': 'pic'})
    page_urls = []
    for a in a_all:
        page_urls.append(a['href'])
    tasks = [fetch_content(pic_url) for pic_url in page_urls]
    pic_pages = await asyncio.gather(*tasks)
    pics = [page_pic(connect, pic_page) for pic_page in pic_pages]
    await asyncio.gather(*pics)


async def main():
    '''
    Walk every listing page and download it.
    :return:
    '''
    # await target_folder()
    start = time.time()
    main_page_text = await fetch_content(url)
    bs = BeautifulSoup(main_page_text, 'lxml')
    page_count = bs.find('div', {'class': 'page'}).find('a', {'class': 'end'}).get_text()
    page_urls = []
    for i in range(1, int(page_count) + 1):
        page_url = f'{url}list_{i}.html'
        page_urls.append(page_url)
    tasks = [page_main(page_url) for page_url in page_urls]
    await asyncio.gather(*tasks)
    end = time.time()
    print(f"Elapsed: {end - start:.2f}s")


# Establish the MongoDB connection
def build_connect():
    client = motor.motor_tornado.MotorClient('localhost', 27017)
    db = client.test
    return db.temp


def insert_url(connect, url_temp):
    # Upsert so the same image URL is stored only once
    query = {'url': url_temp}
    return connect.update_one(query, {"$set": query}, upsert=True)


if __name__ == '__main__':
    # On Linux use:
    # asyncio.run(main())
    # On Windows the call above reports an error after the run finishes (the download
    # itself is unaffected), so use these two lines instead:
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
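The closing comment in the removed main.py refers to a well-known aiohttp-on-Windows quirk: asyncio.run() finishes the download but then reports an event-loop error during teardown. A commonly used alternative, if one wanted to keep asyncio.run() on every platform, is to switch Windows to the selector event loop policy. A minimal sketch of that approach (an illustration, not part of the removed file):

import asyncio
import sys

# On Windows the default proactor loop can report "Event loop is closed" while
# aiohttp connections are finalized after asyncio.run() returns; the selector
# loop avoids that teardown error.
if sys.platform == "win32":
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# asyncio.run(main())  # would then behave the same on Linux and Windows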
@@ -1,5 +0,0 @@
aiohttp~=3.8.3
motor~=3.1.1
beautifulsoup4~=4.11.1
pymongo~=4.3.2
tornado~=6.2
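These five pinned packages are the removed scraper's dependencies (the hunk header above does not show the file name, but the content is in pip requirements format). Note that main.py parses pages with BeautifulSoup's 'lxml' backend, so the lxml package would also have been needed even though it is not pinned here.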