diff --git a/.github/workflows/test-web-funcs.yml b/.github/workflows/test-web-funcs.yml
index 37f2cdd9a..3c1d05c13 100644
--- a/.github/workflows/test-web-funcs.yml
+++ b/.github/workflows/test-web-funcs.yml
@@ -48,12 +48,12 @@ jobs:
- name: Public IP
id: ip
uses: haythem/public-ip@v1.3
- - name: Test proxyfree.py
- run: |
- poetry run pytest unittest/test_proxyfree.py
- name: Test web crawlers
run: |
poetry run pytest unittest/test_crawlers.py
+ - name: Test proxyfree.py
+ run: |
+ poetry run pytest unittest/test_proxyfree.py
- name: Upload log as artifact
uses: actions/upload-artifact@v4
if: ${{ always() }}
diff --git a/config.yml b/config.yml
index 53fac4863..7d8790195 100644
--- a/config.yml
+++ b/config.yml
@@ -25,16 +25,24 @@ network:
# 设置代理服务器地址,支持 http, socks5/socks5h 代理,比如'http://127.0.0.1:1080'
# null表示禁用代理
proxy_server: null
- # 各个站点的免代理地址。地址失效时软件会自动尝试获取新地址,你也可以手动设置
- proxy_free:
- avsox: 'https://avsox.click'
- javbus: 'https://www.seedmm.help'
- javdb: 'https://javdb368.com'
- javlib: 'https://www.y78k.com'
# 网络问题导致抓取数据失败时的重试次数,通常3次就差不多了
- retry: 3
+ retries: 3
# https://en.wikipedia.org/wiki/ISO_8601#Durations
timeout: PT10S
+ # 对列表中的地址不使用梯子(如果启用了的话)
+ unproxied: [
+ 'https://www.seedmm.help',
+ 'https://javdb368.com',
+ 'https://www.y78k.com',
+ 'https://www.javbus.one',
+ 'https://www.tellme.pw',
+ ]
+ # 各个站点的代替地址。
+ # JavSP会按顺序尝试列表里的每一个服务器,如果都不行会使用默认的主站点地址
+ fallback:
+ javbus: ['https://www.seedmm.help']
+ javdb: ['https://javdb368.com']
+ javlib: ['https://www.y78k.com']
################################
crawler:
@@ -52,8 +60,6 @@ crawler:
hardworking: true
# 使用网页番号作为最终番号(启用时会对番号大小写等进行更正)
respect_site_avid: true
- # fc2fan已关站。如果你有镜像,请设置本地镜像文件夹的路径,此文件夹内要有类似'FC2-12345.html'的网页文件
- fc2fan_local_path: null
# 刮削一部电影后的等待时间(设置为0禁用此功能)
# https://en.wikipedia.org/wiki/ISO_8601#Durations
sleep_after_scraping: PT1S
diff --git a/javsp/__main__.py b/javsp/__main__.py
index 7771170e7..a7f407f99 100644
--- a/javsp/__main__.py
+++ b/javsp/__main__.py
@@ -3,13 +3,15 @@
import sys
import json
import time
+import asyncio
import logging
from PIL import Image
from pydantic import ValidationError
+from pydantic_core import Url
from pydantic_extra_types.pendulum_dt import Duration
-import requests
-import threading
-from typing import Dict, List
+from typing import Any, Coroutine, Dict, List
+from javsp.crawlers.all import crawlers
+from javsp.network.client import clear_clients
sys.stdout.reconfigure(encoding='utf-8')
@@ -23,7 +25,7 @@
from javsp.print import TqdmOut
-from javsp.cropper import Cropper, get_cropper
+from javsp.cropper import get_cropper
# 将StreamHandler的stream修改为TqdmOut,以与Tqdm协同工作
@@ -41,11 +43,11 @@
from javsp.func import *
from javsp.image import *
from javsp.datatype import Movie, MovieInfo
-from javsp.web.base import download
-from javsp.web.exceptions import *
-from javsp.web.translate import translate_movie_info
+from javsp.network.utils import url_download
+from javsp.crawlers.exceptions import *
+from javsp.translate import translate_movie_info
-from javsp.config import Cfg, CrawlerID
+from javsp.config import Cfg, CrawlerID, UseJavDBCover
actressAliasMap = {}
@@ -57,86 +59,49 @@ def resolve_alias(name):
return name # 如果找不到别名对应的固定名字,则返回原名
-def import_crawlers():
- """按配置文件的抓取器顺序将该字段转换为抓取器的函数列表"""
- unknown_mods = []
- for _, mods in Cfg().crawler.selection.items():
- valid_mods = []
- for name in mods:
- try:
- # 导入fc2fan抓取器的前提: 配置了fc2fan的本地路径
- # if name == 'fc2fan' and (not os.path.isdir(Cfg().Crawler.fc2fan_local_path)):
- # logger.debug('由于未配置有效的fc2fan路径,已跳过该抓取器')
- # continue
- import_name = 'javsp.web.' + name
- __import__(import_name)
- valid_mods.append(import_name) # 抓取器有效: 使用完整模块路径,便于程序实际使用
- except ModuleNotFoundError:
- unknown_mods.append(name) # 抓取器无效: 仅使用模块名,便于显示
- if unknown_mods:
- logger.warning('配置的抓取器无效: ' + ', '.join(unknown_mods))
-
-
-# 爬虫是IO密集型任务,可以通过多线程提升效率
+# 爬虫是IO密集型任务,可以通过异步并发提升效率
-def parallel_crawler(movie: Movie, tqdm_bar=None):
+async def parallel_crawler(movie: Movie, tqdm_bar=None) -> dict[CrawlerID, MovieInfo]:
"""使用多线程抓取不同网站的数据"""
- def wrapper(parser, info: MovieInfo, retry):
+
+    async def wrapper(id: CrawlerID, info: MovieInfo) -> None:
"""对抓取器函数进行包装,便于更新提示信息和自动重试"""
- crawler_name = threading.current_thread().name
- task_info = f'Crawler: {crawler_name}: {info.dvdid}'
- for cnt in range(retry):
- try:
- parser(info)
- movie_id = info.dvdid or info.cid
- logger.debug(f"{crawler_name}: 抓取成功: '{movie_id}': '{info.url}'")
- setattr(info, 'success', True)
- if isinstance(tqdm_bar, tqdm):
- tqdm_bar.set_description(f'{crawler_name}: 抓取完成')
- break
- except MovieNotFoundError as e:
- logger.debug(e)
- break
- except MovieDuplicateError as e:
- logger.exception(e)
- break
- except (SiteBlocked, SitePermissionError, CredentialError) as e:
- logger.error(e)
- break
- except requests.exceptions.RequestException as e:
- logger.debug(f'{crawler_name}: 网络错误,正在重试 ({cnt+1}/{retry}): \n{repr(e)}')
- if isinstance(tqdm_bar, tqdm):
- tqdm_bar.set_description(f'{crawler_name}: 网络错误,正在重试')
- except Exception as e:
- logger.exception(e)
+ try:
+ crawler = await crawlers[id].create()
+            await crawler.crawl_and_fill(info)
+ movie_id = info.dvdid or info.cid
+ logger.debug(f"{crawler.id.value}: 抓取成功: '{movie_id}': '{info.url}'")
+ setattr(info, 'success', True)
+ if isinstance(tqdm_bar, tqdm):
+ tqdm_bar.set_description(f'{crawler.id.value}: 抓取完成')
+ except MovieNotFoundError as e:
+ logger.debug(e)
+ except MovieDuplicateError as e:
+ logger.exception(e)
+ except (SiteBlocked, SitePermissionError, CredentialError) as e:
+ logger.error(e)
+ except Exception as e:
+ logger.exception(e)
# 根据影片的数据源获取对应的抓取器
- crawler_mods: List[CrawlerID] = Cfg().crawler.selection[movie.data_src]
+ crawler_to_use: List[CrawlerID] = Cfg().crawler.selection[movie.data_src]
+
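+    # all_info: 每个选定的抓取器对应一个全新的MovieInfo实例,抓取器将结果就地填充到其中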
+ all_info: Dict[CrawlerID, MovieInfo] = {i: MovieInfo(movie) for i in crawler_to_use}
- all_info = {i.value: MovieInfo(movie) for i in crawler_mods}
# 番号为cid但同时也有有效的dvdid时,也尝试使用普通模式进行抓取
if movie.data_src == 'cid' and movie.dvdid:
- crawler_mods = crawler_mods + Cfg().crawler.selection.normal
+ crawler_to_use += Cfg().crawler.selection.normal
for i in all_info.values():
i.dvdid = None
for i in Cfg().crawler.selection.normal:
all_info[i] = MovieInfo(movie.dvdid)
- thread_pool = []
- for mod_partial, info in all_info.items():
- mod = f"javsp.web.{mod_partial}"
- parser = getattr(sys.modules[mod], 'parse_data')
- # 将all_info中的info实例传递给parser,parser抓取完成后,info实例的值已经完成更新
- # TODO: 抓取器如果带有parse_data_raw,说明它已经自行进行了重试处理,此时将重试次数设置为1
- if hasattr(sys.modules[mod], 'parse_data_raw'):
- th = threading.Thread(target=wrapper, name=mod, args=(parser, info, 1))
- else:
- th = threading.Thread(target=wrapper, name=mod, args=(parser, info, Cfg().network.retry))
- th.start()
- thread_pool.append(th)
- # 等待所有线程结束
- timeout = Cfg().network.retry * Cfg().network.timeout.total_seconds()
- for th in thread_pool:
- th: threading.Thread
- th.join(timeout=timeout)
+
+ co_pool: list[Coroutine[Any, Any, None]] = []
+ for crawler_id, info in all_info.items():
+ co_pool.append(wrapper(crawler_id, info))
+
+ # 等待所有协程结束
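+    # wrapper内部已捕获各站点的异常,因此gather不会因单个抓取器失败而中断其余任务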
+ await asyncio.gather(*co_pool)
+
# 根据抓取结果更新影片类型判定
if movie.data_src == 'cid' and movie.dvdid:
titles = [all_info[i].title for i in Cfg().crawler.selection[movie.data_src]]
@@ -148,22 +113,22 @@ def wrapper(parser, info: MovieInfo, retry):
movie.data_src = 'normal'
movie.cid = None
all_info = {k: v for k, v in all_info.items() if k not in Cfg().crawler.selection['cid']}
+
# 删除抓取失败的站点对应的数据
all_info = {k:v for k,v in all_info.items() if hasattr(v, 'success')}
for info in all_info.values():
del info.success
- # 删除all_info中键名中的'web.'
- all_info = {k[4:]:v for k,v in all_info.items()}
+
return all_info
-def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]):
+def info_summary(movie: Movie, all_info: Dict[CrawlerID, MovieInfo]):
"""汇总多个来源的在线数据生成最终数据"""
final_info = MovieInfo(movie)
########## 部分字段配置了专门的选取逻辑,先处理这些字段 ##########
# genre
- if 'javdb' in all_info and all_info['javdb'].genre:
- final_info.genre = all_info['javdb'].genre
+    if CrawlerID.javdb in all_info and all_info[CrawlerID.javdb].genre:
+        final_info.genre = all_info[CrawlerID.javdb].genre
########## 移除所有抓取器数据中,标题尾部的女优名 ##########
if Cfg().summarizer.title.remove_trailing_actor_name:
@@ -197,7 +162,7 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]):
setattr(final_info, attr, incoming)
absorbed.append(attr)
if absorbed:
- logger.debug(f"从'{name}'中获取了字段: " + ' '.join(absorbed))
+ logger.debug(f"从'{name.value}'中获取了字段: " + ' '.join(absorbed))
# 使用网站的番号作为番号
if Cfg().crawler.respect_site_avid:
id_weight = {}
@@ -216,7 +181,7 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]):
else:
final_info.cid = final_id
# javdb封面有水印,优先采用其他站点的封面
- javdb_cover = getattr(all_info.get('javdb'), 'cover', None)
+ javdb_cover = getattr(all_info.get(CrawlerID.javdb), 'cover', None)
if javdb_cover is not None:
match Cfg().crawler.use_javdb_cover:
case UseJavDBCover.fallback:
@@ -402,7 +367,7 @@ def should_use_ai_crop_match(label):
fanart_cropped = add_label_to_poster(fanart_cropped, UNCENSORED_MARK_FILE, LabelPostion.BOTTOM_LEFT)
fanart_cropped.save(movie.poster_file)
-def RunNormalMode(all_movies):
+async def RunNormalMode(all_movies):
"""普通整理模式"""
def check_step(result, msg='步骤错误'):
"""检查一个整理步骤的结果,并负责更新tqdm的进度"""
@@ -427,7 +392,7 @@ def check_step(result, msg='步骤错误'):
inner_bar = tqdm(total=total_step, desc='步骤', ascii=True, leave=False)
# 依次执行各个步骤
inner_bar.set_description(f'启动并发任务')
- all_info = parallel_crawler(movie, inner_bar)
+ all_info = await parallel_crawler(movie, inner_bar)
msg = f'为其配置的{len(Cfg().crawler.selection[movie.data_src])}个抓取器均未获取到影片信息'
check_step(all_info, msg)
@@ -447,9 +412,9 @@ def check_step(result, msg='步骤错误'):
inner_bar.set_description('下载封面图片')
if Cfg().summarizer.cover.highres:
- cover_dl = download_cover(movie.info.covers, movie.fanart_file, movie.info.big_covers)
+ cover_dl = await download_cover(movie.info.covers, movie.fanart_file, movie.info.big_covers)
else:
- cover_dl = download_cover(movie.info.covers, movie.fanart_file)
+ cover_dl = await download_cover(movie.info.covers, movie.fanart_file)
check_step(cover_dl, '下载封面图片失败')
cover, pic_path = cover_dl
# 确保实际下载的封面的url与即将写入到movie.info中的一致
@@ -476,12 +441,12 @@ def check_step(result, msg='步骤错误'):
fanart_destination = f"{extrafanartdir}/{id}.png"
try:
- info = download(pic_url, fanart_destination)
+ info = await url_download(Url(pic_url), fanart_destination)
if valid_pic(fanart_destination):
filesize = get_fmt_size(pic_path)
width, height = get_pic_size(pic_path)
- elapsed = time.strftime("%M:%S", time.gmtime(info['elapsed']))
- speed = get_fmt_size(info['rate']) + '/s'
+ elapsed = str(info.elapsed)
+ speed = f"{info.get_rate()}Mbps"
logger.info(f"已下载剧照{pic_url} {id}.png: {width}x{height}, {filesize} [{elapsed}, {speed}]")
else:
check_step(False, f"下载剧照{id}: {pic_url}失败")
@@ -512,38 +477,29 @@ def check_step(result, msg='步骤错误'):
return return_movies
-def download_cover(covers, fanart_path, big_covers=[]):
+async def download_cover(covers, fanart_path, big_covers=[]):
"""下载封面图片"""
# 优先下载高清封面
for url in big_covers:
pic_path = get_pic_path(fanart_path, url)
- for _ in range(Cfg().network.retry):
- try:
- info = download(url, pic_path)
- if valid_pic(pic_path):
- filesize = get_fmt_size(pic_path)
- width, height = get_pic_size(pic_path)
- elapsed = time.strftime("%M:%S", time.gmtime(info['elapsed']))
- speed = get_fmt_size(info['rate']) + '/s'
- logger.info(f"已下载高清封面: {width}x{height}, {filesize} [{elapsed}, {speed}]")
- return (url, pic_path)
- except requests.exceptions.HTTPError:
- # HTTPError通常说明猜测的高清封面地址实际不可用,因此不再重试
- break
+        try:
+            info = await url_download(Url(url), pic_path)
+        except Exception as e:
+            # 猜测的高清封面地址可能实际不可用,跳过并尝试下一个地址
+            logger.debug(f"下载高清封面失败: '{url}': {repr(e)}")
+            continue
+        if valid_pic(pic_path):
+            filesize = get_fmt_size(pic_path)
+            width, height = get_pic_size(pic_path)
+            elapsed = str(info.elapsed)
+            speed = f"{info.get_rate()}Mbps"
+            logger.info(f"已下载高清封面: {width}x{height}, {filesize} [{elapsed}, {speed}]")
+            return (url, pic_path)
# 如果没有高清封面或高清封面下载失败
for url in covers:
pic_path = get_pic_path(fanart_path, url)
- for _ in range(Cfg().network.retry):
- try:
- download(url, pic_path)
- if valid_pic(pic_path):
- logger.debug(f"已下载封面: '{url}'")
- return (url, pic_path)
- else:
- logger.debug(f"图片无效或已损坏: '{url}',尝试更换下载地址")
- break
- except Exception as e:
- logger.debug(e, exc_info=True)
+        try:
+            await url_download(Url(url), pic_path)
+        except Exception as e:
+            logger.debug(f"下载封面失败: '{url}': {repr(e)}")
+            continue
+        if valid_pic(pic_path):
+            logger.debug(f"已下载封面: '{url}'")
+            return (url, pic_path)
+        else:
+            logger.debug(f"图片无效或已损坏: '{url}',尝试更换下载地址")
logger.error(f"下载封面图片失败")
logger.debug('big_covers:'+str(big_covers) + ', covers'+str(covers))
return None
@@ -558,14 +514,7 @@ def get_pic_path(fanart_path, url):
pic_path = fanart_base + "." + pic_extend
return pic_path
-def error_exit(success, err_info):
- """检查业务逻辑是否成功完成,如果失败则报错退出程序"""
- if not success:
- logger.error(err_info)
- sys.exit(1)
-
-
-def entry():
+async def aentry():
try:
Cfg()
except ValidationError as e:
@@ -583,22 +532,30 @@ def entry():
# 检查更新
version_info = 'JavSP ' + getattr(sys, 'javsp_version', '未知版本/从代码运行')
logger.debug(version_info.center(60, '='))
- check_update(Cfg().other.check_update, Cfg().other.auto_update)
+ await check_update(Cfg().other.check_update, Cfg().other.auto_update)
root = get_scan_dir(Cfg().scanner.input_directory)
- error_exit(root, '未选择要扫描的文件夹')
+ if root is None:
+ logger.error('未选择要扫描的文件夹')
+ sys.exit(1)
# 导入抓取器,必须在chdir之前
- import_crawlers()
os.chdir(root)
print(f'扫描影片文件...')
recognized = scan_movies(root)
movie_count = len(recognized)
recognize_fail = []
- error_exit(movie_count, '未找到影片文件')
+ if movie_count == 0:
+ logger.error('未找到影片文件')
+ sys.exit(1)
logger.info(f'扫描影片文件:共找到 {movie_count} 部影片')
- RunNormalMode(recognized + recognize_fail)
+ await RunNormalMode(recognized + recognize_fail)
+
+ await clear_clients()
sys.exit(0)
+def entry():
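+    # 注意: debug=True 会启用 asyncio 的调试模式,便于在开发时定位协程相关问题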
+ asyncio.run(aentry(), debug=True)
+
if __name__ == "__main__":
entry()
diff --git a/javsp/chromium.py b/javsp/chromium.py
index db315293e..1f8d01964 100644
--- a/javsp/chromium.py
+++ b/javsp/chromium.py
@@ -32,6 +32,8 @@ def decrypt(self, encrypted_value):
def get_browsers_cookies():
"""获取系统上的所有Chromium系浏览器的JavDB的Cookies"""
+ if not sys.platform.startswith('win32'): # 不支持windows以外的系统
+ return []
# 不予支持: Opera, 360安全&极速, 搜狗使用非标的用户目录或数据格式; QQ浏览器屏蔽站点
user_data_dirs = {
'Chrome': '/Google/Chrome/User Data',
diff --git a/javsp/config.py b/javsp/config.py
index 3fbc8f071..e87b5dc28 100644
--- a/javsp/config.py
+++ b/javsp/config.py
@@ -39,9 +39,10 @@ class CrawlerID(str, Enum):
class Network(BaseConfig):
proxy_server: Url | None
- retry: NonNegativeInt = 3
+ retries: NonNegativeInt = 3
timeout: Duration
- proxy_free: Dict[CrawlerID, Url]
+ unproxied: List[Url]
+ fallback: Dict[CrawlerID, List[str]]
class CrawlerSelect(BaseConfig):
def items(self) -> List[tuple[str, list[CrawlerID]]]:
@@ -109,7 +110,6 @@ class Crawler(BaseConfig):
required_keys: list[MovieInfoField]
hardworking: bool
respect_site_avid: bool
- fc2fan_local_path: Path | None
sleep_after_scraping: Duration
use_javdb_cover: UseJavDBCover
normalize_actress_name: bool
diff --git a/javsp/crawlers/all.py b/javsp/crawlers/all.py
new file mode 100644
index 000000000..8c262ecc1
--- /dev/null
+++ b/javsp/crawlers/all.py
@@ -0,0 +1,30 @@
+from collections.abc import Coroutine
+from typing import Any, Dict
+from javsp.config import CrawlerID
+from javsp.crawlers.interface import Crawler
+from javsp.crawlers.sites import \
+ airav, arzon, arzon_iv, avsox, avwiki, dl_getchu, fanza, fc2, fc2ppvdb, \
+ gyutto, jav321, javbus, javdb, javlib, javmenu, mgstage, njav, prestige
+
+__all__ = ['crawlers']
+
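+# 抓取器注册表: 将CrawlerID映射到对应的Crawler子类,使用方通过 await crawlers[id].create() 按需构造实例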
+crawlers: Dict[CrawlerID, type[Crawler]] = {
+ CrawlerID.airav: airav. AiravCrawler,
+ CrawlerID.arzon: arzon. ArzonCrawler,
+ CrawlerID.arzon_iv: arzon_iv. ArzonIvCrawler,
+ CrawlerID.avsox: avsox. AvsoxCrawler,
+ CrawlerID.avwiki: avwiki. AvWikiCrawler,
+ CrawlerID.dl_getchu: dl_getchu.DlGetchuCrawler,
+ CrawlerID.fanza: fanza. FanzaCrawler,
+ CrawlerID.fc2: fc2. Fc2Crawler,
+ CrawlerID.fc2ppvdb: fc2ppvdb. Fc2PpvDbCrawler,
+ CrawlerID.gyutto: gyutto. GyuttoCrawler,
+ CrawlerID.jav321: jav321. Jav321Crawler,
+ CrawlerID.javbus: javbus. JavbusCrawler,
+ CrawlerID.javdb: javdb. JavDbCrawler,
+ CrawlerID.javlib: javlib. JavLibCrawler,
+ CrawlerID.javmenu: javmenu. JavMenuCrawler,
+ CrawlerID.mgstage: mgstage. MgstageCrawler,
+ CrawlerID.njav: njav. NjavCrawler,
+ CrawlerID.prestige: prestige. PrestigeCrawler,
+}
diff --git a/javsp/web/exceptions.py b/javsp/crawlers/exceptions.py
similarity index 100%
rename from javsp/web/exceptions.py
rename to javsp/crawlers/exceptions.py
diff --git a/javsp/crawlers/interface.py b/javsp/crawlers/interface.py
new file mode 100644
index 000000000..c82085554
--- /dev/null
+++ b/javsp/crawlers/interface.py
@@ -0,0 +1,21 @@
+from javsp.config import CrawlerID
+from javsp.datatype import MovieInfo
+from abc import ABC, abstractmethod
+from typing import Self
+from aiohttp import ClientSession
+
+
+class Crawler(ABC):
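+    """站点抓取器的抽象基类,定义统一的创建与抓取接口"""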
+ base_url: str
+ client: ClientSession
+ id: CrawlerID
+
+
+ @classmethod
+ @abstractmethod
+ async def create(cls) -> Self:
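+        """异步工厂方法: 解析站点的可用地址并绑定HTTP会话,返回就绪的抓取器实例"""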
+ pass
+
+ @abstractmethod
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
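+        """抓取指定影片的数据,并就地填充到传入的MovieInfo中"""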
+ pass
diff --git a/javsp/crawlers/proxyfree.py b/javsp/crawlers/proxyfree.py
new file mode 100644
index 000000000..45da59b94
--- /dev/null
+++ b/javsp/crawlers/proxyfree.py
@@ -0,0 +1,98 @@
+"""获取各个网站的免代理地址"""
+from collections.abc import Callable, Coroutine
+import re
+from typing import Any, Dict
+
+from pydantic_core import Url
+from pydantic_extra_types.pendulum_dt import Duration
+from lxml import html
+
+from javsp.config import CrawlerID
+from javsp.network.utils import test_connect, choose_one_connectable
+from javsp.network.client import get_session
+
+
+async def _get_avsox_urls() -> list[str]:
+ link = 'https://tellme.pw/avsox'
+ s = get_session(Url(link))
+ resp = await s.get(link)
+ tree = html.fromstring(await resp.text())
+ urls = tree.xpath('//h4/strong/a/@href')
+ return urls
+
+
+async def _get_javbus_urls() -> list[str]:
+ link = 'https://www.javbus.one/'
+ s = get_session(Url(link))
+ resp = await s.get(link)
+ text = await resp.text()
+ urls = re.findall(r'防屏蔽地址:(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})', text, re.I | re.A)
+ return urls
+
+
+async def _get_javlib_urls() -> list[str]:
+ link = 'https://github.com/javlibcom'
+ s = get_session(Url(link))
+ resp = await s.get(link)
+ tree = html.fromstring(await resp.text())
+ text = tree.xpath("//div[@class='p-note user-profile-bio mb-3 js-user-profile-bio f4']")[0].text_content()
+ match = re.search(r'[\w\.]+', text, re.A)
+ if match:
+ domain = f'https://www.{match.group(0)}.com'
+ return [domain]
+ return []
+
+
+async def _get_javdb_urls() -> list[str]:
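+    # 从发布页引用的 index JS 中提取 $officialUrl 字段作为官方地址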
+ root_link = 'https://jav524.app'
+ s = get_session(Url(root_link))
+ resp = await s.get(root_link)
+ tree = html.fromstring(await resp.text())
+ js_links = tree.xpath("//script[@src]/@src")
+ for link in js_links:
+ if '/js/index' in link:
+ link = root_link + link
+ resp = await s.get(link)
+ text = await resp.text()
+ match = re.search(r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"', text, flags=re.I | re.A)
+ if match:
+ return [match.group(1)]
+ return []
+
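+# 各站点免代理地址的获取函数注册表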
+proxy_free_fns: Dict[CrawlerID, Callable[[], Coroutine[Any, Any, list[str]]]]= {
+ CrawlerID.avsox: _get_avsox_urls,
+ CrawlerID.javdb: _get_javdb_urls,
+ CrawlerID.javbus: _get_javbus_urls,
+ CrawlerID.javlib: _get_javlib_urls,
+}
+
+async def get_proxy_free_url(site_name: CrawlerID, prefer_url: str | None = None) -> str | None:
+ """获取指定网站的免代理地址
+ Args:
+ site_name (str): 站点名称
+ prefer_url (str, optional): 优先测试此url是否可用
+ Returns:
+ str: 指定站点的免代理地址(失败时为空字符串)
+ """
+ if prefer_url and await test_connect(prefer_url, Duration(seconds=5)):
+ return prefer_url
+
+ if site_name in proxy_free_fns:
+ try:
+ urls = await proxy_free_fns[site_name]()
+ return await choose_one_connectable(urls)
+        except Exception:
+ return None
+ else:
+ raise Exception("Dont't know how to get proxy-free url for " + site_name)
+
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ print('javdb:\t', await _get_javdb_urls())
+ print('javlib:\t', await _get_javlib_urls())
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/airav.py b/javsp/crawlers/sites/airav.py
new file mode 100644
index 000000000..00c0503b9
--- /dev/null
+++ b/javsp/crawlers/sites/airav.py
@@ -0,0 +1,129 @@
+"""从airav抓取数据"""
+import re
+from html import unescape
+from typing import Dict
+
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.network.client import get_session
+from javsp.network.utils import resolve_site_fallback
+from javsp.config import Cfg, CrawlerID
+from javsp.datatype import MovieInfo
+from javsp.crawlers.interface import Crawler
+
+
+class AiravCrawler(Crawler):
+ id = CrawlerID.airav
+
+ headers: Dict[str, str]
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.airav.wiki')
+ self.base_url = str(url)
+ self.client = get_session(url)
+ self.headers = {'Accept-Language': 'zh-TW,zh;q=0.9'}
+ return self
+
+ async def search_movie(self, dvdid: str):
+ """通过搜索番号获取指定的影片在网站上的ID"""
+ # 部分影片的ID并不直接等于番号(如012717-360),此时需要尝试通过搜索来寻找影片
+ page = 0
+ count = 1
+ result = []
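+        # 逐页请求搜索接口,直到收集到count条结果或某一页返回为空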
+ while len(result) < count:
+ url = f'{self.base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={dvdid}&page={page}'
+ response = await self.client.get(url, headers=self.headers)
+ resp = await response.json()
+ # {"offset": 2460, "count": 12345, "result": [...], "status": "ok"}
+ if resp['result']:
+ result.extend(resp['result'])
+ count = resp['count']
+ page += 1
+ else: # 结果为空,结束循环
+ break
+ # 如果什么都没搜索到,直接返回
+ if not result:
+ raise MovieNotFoundError(__name__, dvdid)
+ # 排序,以优先选择更符合预期的结果(如'012717_472'对应的'1pondo_012717_472'和'_1pondo_012717_472')
+ result.sort(key=lambda x:x['barcode'])
+ # 从所有搜索结果中选择最可能的番号,返回它的URL
+ target = dvdid.replace('-', '_')
+ for item in result:
+ # {'vid': '', 'slug': '', 'name': '', 'url': '', 'view': '', 'img_url': '', 'barcode': ''}
+ barcode = item['barcode'].replace('-', '_')
+ if target in barcode:
+ return item['barcode']
+ raise MovieNotFoundError(__name__, dvdid, result)
+
+
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ # airav也提供简体,但是为了尽量保持女优名等与其他站点一致,抓取繁体的数据
+ url = f'{self.base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW'
+ response = await self.client.get(url, headers=self.headers)
+ resp_json = await response.json()
+ # 只在番号是纯数字时,尝试进行搜索,否则可能导致搜索到错误的影片信息
+ if resp_json['count'] == 0 and re.match(r'\d{6}[-_]\d{2,3}', movie.dvdid):
+ barcode = await self.search_movie(movie.dvdid)
+ if barcode:
+ url = f'{self.base_url}/api/video/barcode/{barcode}?lng=zh-TW'
+ response = await self.client.get(url, headers=self.headers)
+ resp_json = await response.json()
+
+ if resp_json['count'] == 0:
+ raise MovieNotFoundError(__name__, movie.dvdid, resp_json)
+
+ # 从API返回的数据中提取需要的字段
+ # TODO: 数据中含有更多信息(如女优的中文&日文名对照),可能有助于未来功能扩展
+ data = resp_json['result']
+ dvdid = data['barcode']
+ movie.dvdid = dvdid
+ movie.url = self.base_url + 'video/' + dvdid
+ # plot和title中可能含有HTML的转义字符,需要进行解转义处理
+ movie.plot = unescape(data['description']) or None
+ movie.cover = data['img_url']
+ # airav的genre是以搜索关键词的形式组织的,没有特定的genre_id
+ movie.genre = [i['name'] for i in data['tags']]
+ movie.title = unescape(data['name'])
+ movie.actress = [i['name'] for i in data['actors']]
+ movie.publish_date = data['publish_date']
+ movie.preview_pics = data['images'] or []
+ if data['factories']:
+ movie.producer = data['factories'][0]['name']
+
+ if Cfg().crawler.hardworking:
+ # 注意这里用的是获取的dvdid,而不是传入的movie.dvdid(如'1pondo_012717_472'与'012717_472')
+ video_url = f"{self.base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}"
+ response = await self.client.get(video_url, headers=self.headers)
+ resp = await response.json()
+ # 如果失败,结果如 {'msg': 'fail', 'status': 'fail'}
+ if 'data' in resp:
+ # 除url外还有url_cdn, url_hlx, url_hls_cdn字段,后两者为m3u8格式。目前将url作为预览视频的地址
+ # TODO: 发现部分影片(如080719-976)的传统格式预览片错误
+ movie.preview_video = resp['data'].get('url')
+
+ # airav上部分影片会被标记为'馬賽克破壞版'等,这些影片的title、plot和genre都不再准确
+ for keyword in ('馬賽克破壞版', '馬賽克破解版', '無碼流出版'):
+ if movie.title and keyword in movie.title:
+ movie.title = None
+ movie.genre = []
+ if movie.plot and keyword in movie.plot:
+ movie.plot = None
+ movie.genre = []
+ if not any([movie.title, movie.plot, movie.genre]):
+ break
+
+if __name__ == "__main__":
+ from javsp.network.client import clear_clients
+
+ async def test_main():
+ crawler = await AiravCrawler.create()
+ movie = MovieInfo("DSAD-938")
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ await clear_clients()
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/arzon.py b/javsp/crawlers/sites/arzon.py
new file mode 100644
index 000000000..6fb868cc4
--- /dev/null
+++ b/javsp/crawlers/sites/arzon.py
@@ -0,0 +1,107 @@
+"""从arzon抓取数据"""
+import re
+
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_session
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from javsp.crawlers.exceptions import *
+from javsp.datatype import MovieInfo
+from lxml import html
+
+class ArzonCrawler(Crawler):
+ id = CrawlerID.arzon
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, "https://www.arzon.jp")
+ self.base_url = str(url)
+ self.client = get_session(url)
+ # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F
+ skip_verify_url = f"{self.base_url}/index.php?action=adult_customer_agecheck&agecheck=1"
+ await self.client.get(skip_verify_url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ full_id = movie.dvdid
+ url = f'{self.base_url}/itemlist.html?t=&m=all&s=&q={full_id}'
+ # url = f'{base_url}/imagelist.html?q={full_id}'
+
+ r = await self.client.get(url)
+ if r.status == 404:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
+ data = html.fromstring(await r.read())
+
+ urls = data.xpath("//h2/a/@href")
+ if len(urls) == 0:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+
+ item_url = self.base_url[:-1] + urls[0]
+ e = await self.client.get(item_url)
+ item = html.fromstring(await e.read())
+
+ title = item.xpath("//div[@class='detail_title_new2']//h1/text()")[0]
+ cover = item.xpath("//td[@align='center']//a/img/@src")[0]
+ item_text = item.xpath("//div[@class='item_text']/text()")
+ plot = [item.strip() for item in item_text if item.strip() != ''][0]
+ preview_pics_arr = item.xpath("//div[@class='detail_img']//img/@src")
+ # 使用列表推导式添加 "http:" 并去除 "m_"
+ preview_pics = [("https:" + url).replace("m_", "") for url in preview_pics_arr]
+
+        video_type = genre = None  # 先初始化可能缺失的字段,避免未定义引用
+        container = item.xpath("//div[@class='item_register']/table//tr")
+ for row in container:
+ key = row.xpath("./td[1]/text()")[0]
+ contents = row.xpath("./td[2]//text()")
+ content = [item.strip() for item in contents if item.strip() != '']
+ index = 0
+ value = content[index] if content and index < len(content) else None
+ if key == "AV女優:":
+ movie.actress = content
+ if key == "AVメーカー:":
+ movie.producer = value
+ if key == "AVレーベル:":
+ video_type = value
+ if key == "シリーズ:":
+ movie.serial = value
+ if key == "監督:":
+ movie.director = value
+ if key == "発売日:" and value:
+ movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-")
+ if key == "収録時間:" and value:
+ movie.duration = re.search(r'([\d.]+)分', value).group(1)
+ if key == "品番:":
+ dvd_id = value
+ elif key == "タグ:":
+ genre = value
+
+        genres = []
+        if video_type:
+            genres.append(video_type)
+        if genre is not None:
+            genres.append(genre)
+
+ movie.genre = genres
+ movie.url = item_url
+ movie.title = title
+ movie.plot = plot
+ movie.cover = f'https:{cover}'
+ movie.preview_pics = preview_pics
+
+if __name__ == "__main__":
+ from javsp.network.client import clear_clients
+
+ async def test_main():
+ crawler = await ArzonCrawler.create()
+ movie = MovieInfo("CSCT-011")
+ try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ await clear_clients()
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/arzon_iv.py b/javsp/crawlers/sites/arzon_iv.py
new file mode 100644
index 000000000..40b763b7f
--- /dev/null
+++ b/javsp/crawlers/sites/arzon_iv.py
@@ -0,0 +1,100 @@
+"""从arzon_iv抓取数据"""
+import re
+
+
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_session
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from javsp.crawlers.exceptions import *
+from javsp.datatype import MovieInfo
+from lxml import html
+
+class ArzonIvCrawler(Crawler):
+ id = CrawlerID.arzon_iv
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, "https://www.arzon.jp")
+ self.base_url = str(url)
+ self.client = get_session(url)
+ # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F
+ skip_verify_url = f"{self.base_url}/index.php?action=adult_customer_agecheck&agecheck=1"
+ await self.client.get(skip_verify_url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ full_id = movie.dvdid
+ url = f'{self.base_url}/imagelist.html?q={full_id}'
+ # url = f'{base_url}/imagelist.html?q={full_id}'
+
+ r = await self.client.get(url)
+ if r.status == 404:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
+ data = html.fromstring(await r.read())
+
+ urls = data.xpath("//h2/a/@href")
+ if len(urls) == 0:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+
+ item_url = self.base_url[:-1] + urls[0]
+ e = await self.client.get(item_url)
+ item = html.fromstring(await e.read())
+
+ title = item.xpath("//div[@class='detail_title_new']//h1/text()")[0]
+ cover = item.xpath("//td[@align='center']//a/img/@src")[0]
+ item_text = item.xpath("//div[@class='item_text']/text()")
+ plot = [item.strip() for item in item_text if item.strip() != ''][0]
+
+        video_type = genre = None  # 先初始化可能缺失的字段,避免未定义引用
+        container = item.xpath("//div[@class='item_register']/table//tr")
+ for row in container:
+ key = row.xpath("./td[1]/text()")[0]
+ contents = row.xpath("./td[2]//text()")
+ content = [item.strip() for item in contents if item.strip() != '']
+ index = 0
+ value = content[index] if content and index < len(content) else None
+ if key == "タレント:":
+ movie.actress = content
+ if key == "イメージメーカー:":
+ movie.producer = value
+ if key == "イメージレーベル:":
+ video_type = value
+ if key == "監督:":
+ movie.director = value
+ if key == "発売日:" and value:
+ movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-")
+ if key == "収録時間:" and value:
+ movie.duration = re.search(r'([\d.]+)分', value).group(1)
+ if key == "品番:":
+ dvd_id = value
+ elif key == "タグ:":
+ genre = value
+
+        genres = []
+        if video_type:
+            genres.append(video_type)
+        if genre is not None:
+            genres.append(genre)
+
+ movie.genre = genres
+ movie.url = item_url
+ movie.title = title
+ movie.plot = plot
+ movie.cover = f'https:{cover}'
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await ArzonIvCrawler.create()
+ movie = MovieInfo("KIDM-1137B")
+ try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/avsox.py b/javsp/crawlers/sites/avsox.py
new file mode 100644
index 000000000..75dcd67c2
--- /dev/null
+++ b/javsp/crawlers/sites/avsox.py
@@ -0,0 +1,86 @@
+"""从avsox抓取数据"""
+
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_session
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from lxml import html
+
+class AvsoxCrawler(Crawler):
+ id = CrawlerID.avsox
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, "https://avsox.click/")
+ self.base_url = str(url)
+ self.client = get_session(url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ full_id: str = movie.dvdid
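+        # avsox将FC2影片以'FC2-PPV-'开头的番号收录,搜索前先转换番号格式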
+ if full_id.startswith('FC2-'):
+ full_id = full_id.replace('FC2-', 'FC2-PPV-')
+ resp = await self.client.get(f'{self.base_url}tw/search/{full_id}')
+ tree = html.fromstring(await resp.text())
+ tree.make_links_absolute(str(resp.url), resolve_base_href=True)
+ ids = tree.xpath("//div[@class='photo-info']/span/date[1]/text()")
+ urls = tree.xpath("//a[contains(@class, 'movie-box')]/@href")
+ ids_lower = list(map(str.lower, ids))
+ if full_id.lower() not in ids_lower:
+ raise MovieNotFoundError(__name__, movie.dvdid, ids)
+
+ url = urls[ids_lower.index(full_id.lower())]
+ url = url.replace('/tw/', '/cn/', 1)
+
+ # 提取影片信息
+ resp = await self.client.get(url)
+ tree = html.fromstring(await resp.text())
+ container = tree.xpath("/html/body/div[@class='container']")[0]
+ title = container.xpath("h3/text()")[0]
+ cover = container.xpath("//a[@class='bigImage']/@href")[0]
+ info = container.xpath("div/div[@class='col-md-3 info']")[0]
+ dvdid = info.xpath("p/span[@style]/text()")[0]
+ publish_date = info.xpath("p/span[text()='发行时间:']")[0].tail.strip()
+ duration = info.xpath("p/span[text()='长度:']")[0].tail.replace('分钟', '').strip()
+ producer, serial = None, None
+ producer_tag = info.xpath("p[text()='制作商: ']")[0].getnext().xpath("a")
+ if producer_tag:
+ producer = producer_tag[0].text_content()
+ serial_tag = info.xpath("p[text()='系列:']")
+ if serial_tag:
+ serial = serial_tag[0].getnext().xpath("a/text()")[0]
+ genre = info.xpath("p/span[@class='genre']/a/text()")
+ actress = container.xpath("//a[@class='avatar-box']/span/text()")
+
+ movie.dvdid = dvdid.replace('FC2-PPV-', 'FC2-')
+ movie.url = url
+ movie.title = title.replace(dvdid, '').strip()
+ movie.cover = cover
+ movie.publish_date = publish_date
+ movie.duration = duration
+ movie.genre = genre
+ movie.actress = actress
+ if full_id.startswith('FC2-'):
+ # avsox把FC2作品的拍摄者归类到'系列'而制作商固定为'FC2-PPV',这既不合理也与其他的站点不兼容,因此进行调整
+ movie.producer = serial
+ else:
+ movie.producer = producer
+ movie.serial = serial
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await AvsoxCrawler.create()
+ movie = MovieInfo("082713-417")
+ try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/avwiki.py b/javsp/crawlers/sites/avwiki.py
new file mode 100644
index 000000000..ec4b3adf9
--- /dev/null
+++ b/javsp/crawlers/sites/avwiki.py
@@ -0,0 +1,82 @@
+"""从av-wiki抓取数据"""
+
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.datatype import MovieInfo
+from javsp.crawlers.interface import Crawler
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_session
+from javsp.config import CrawlerID
+from lxml import html
+
+class AvWikiCrawler(Crawler):
+ id = CrawlerID.avwiki
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://av-wiki.net')
+ self.base_url = str(url)
+ self.client = get_session(url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """从网页抓取并解析指定番号的数据
+ Args:
+ movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
+ """
+ movie.url = url = f'{self.base_url}{movie.dvdid}'
+
+ resp = await self.client.get(url)
+ if resp.status == 404:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ tree = html.fromstring(await resp.text())
+
+ cover_tag = tree.xpath("//header/div/a[@class='image-link-border']/img")
+ if cover_tag:
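+            # 从srcset中选取宽度最大的图片作为封面,解析失败时回退到src属性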
+ try:
+ srcset = cover_tag[0].get('srcset').split(', ')
+ src_set_urls = {}
+ for src in srcset:
+ url, width = src.split()
+ width = int(width.rstrip('w'))
+ src_set_urls[width] = url
+ max_pic = sorted(src_set_urls.items(), key=lambda x:x[0], reverse=True)
+ movie.cover = max_pic[0][1]
+ except:
+ movie.cover = cover_tag[0].get('src')
+ body = tree.xpath("//section[@class='article-body']")[0]
+ title = body.xpath("div/p/text()")[0]
+ title = title.replace(f"【{movie.dvdid}】", '')
+ cite_url = body.xpath("div/cite/a/@href")[0]
+ cite_url = cite_url.split('?aff=')[0]
+ info = body.xpath("dl[@class='dltable']")[0]
+ dt_txt_ls, dd_tags = info.xpath("dt/text()"), info.xpath("dd")
+ data = {}
+ for dt_txt, dd in zip(dt_txt_ls, dd_tags):
+ dt_txt = dt_txt.strip()
+ a_tag = dd.xpath('a')
+ if len(a_tag) == 0:
+ dd_txt = dd.text.strip()
+ else:
+ dd_txt = [i.text.strip() for i in a_tag]
+ if isinstance(dd_txt, list) and dt_txt != 'AV女優名': # 只有女优名以列表的数据格式保留
+ dd_txt = dd_txt[0]
+ data[dt_txt] = dd_txt
+
+ ATTR_MAP = {'メーカー': 'producer', 'AV女優名': 'actress', 'メーカー品番': 'dvdid', 'シリーズ': 'serial', '配信開始日': 'publish_date'}
+ for key, attr in ATTR_MAP.items():
+ setattr(movie, attr, data.get(key))
+ movie.title = title
+ movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await AvWikiCrawler.create()
+ movie = MovieInfo("259LUXU-593")
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/dl_getchu.py b/javsp/crawlers/sites/dl_getchu.py
new file mode 100644
index 000000000..c34ad17c2
--- /dev/null
+++ b/javsp/crawlers/sites/dl_getchu.py
@@ -0,0 +1,132 @@
+"""从dl.getchu官网抓取数据"""
+import re
+import logging
+
+from javsp.config import CrawlerID
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.crawlers.interface import Crawler
+from javsp.network.client import get_session
+from javsp.network.utils import resolve_site_fallback
+from javsp.crawlers.exceptions import *
+from javsp.datatype import MovieInfo
+from lxml import html
+from lxml.html import HtmlElement
+
+def get_movie_title(tree: HtmlElement):
+    container = tree.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[2]")
+    title = ''  # 页面结构不符时返回空标题,避免未定义引用
+    if len(container) > 0:
+        container = container[0]
+        rows = container.xpath('.//tr')
+ for row in rows:
+ for cell in row.xpath('.//td/div'):
+ # 获取单元格文本内容
+ if cell.text:
+ title = str(cell.text).strip()
+ return title
+
+
+def get_movie_img(tree: HtmlElement, getchu_id: str):
+ img_src = ''
+ container = tree.xpath(f'//img[contains(@src, "{getchu_id}top.jpg")]')
+ if len(container) > 0:
+ container = container[0]
+ img_src = container.get('src')
+ return img_src
+
+
+def get_movie_preview(tree: HtmlElement, getchu_id: str):
+ preview_pics = []
+ container = tree.xpath(f'//img[contains(@src, "{getchu_id}_")]')
+ if len(container) > 0:
+ for c in container:
+ preview_pics.append(c.get('src'))
+ return preview_pics
+
+
+DURATION_PATTERN = re.compile(r'(?:動画)?(\d+)分')
+
+
+class DlGetchuCrawler(Crawler):
+ id = CrawlerID.dl_getchu
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://dl.getchu.com')
+ self.base_url = str(url)
+ self.client = get_session(url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ # 去除番号中的'GETCHU'字样
+ id_uc = movie.dvdid.upper()
+ if not id_uc.startswith('GETCHU-'):
+ raise ValueError('Invalid GETCHU number: ' + movie.dvdid)
+ getchu_id = id_uc.replace('GETCHU-', '')
+ # 抓取网页
+ url = f'{self.base_url}i/item{getchu_id}'
+ r = await self.client.get(url)
+ if r.status == 404:
+ raise MovieNotFoundError(__name__, movie.dvdid)
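+        # dl.getchu 页面使用 EUC-JP 编码,需显式解码后再交给lxml解析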
+ tree = html.fromstring((await r.read()).decode(encoding='euc_jp', errors='ignore'))
+ tree.make_links_absolute(base_url=str(self.base_url))
+ container = tree.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[3]")
+ if len(container) > 0:
+ container = container[0]
+ # 将表格提取为键值对
+ rows = container.xpath('.//table/tr')
+ kv_rows = [i for i in rows if len(i) == 2]
+ data = {}
+ for row in kv_rows:
+ # 获取单元格文本内容
+ key = row.xpath("td[@class='bluetext']/text()")[0]
+ # 是否包含a标签: 有的属性是用表示的,不是text
+ a_tags = row.xpath("td[2]/a")
+ if a_tags:
+ value = [i.text for i in a_tags]
+ else:
+ # 获取第2个td标签的内容(下标从1开始计数)
+ value = row.xpath("td[2]/text()")
+ data[key] = value
+
+ for key, value in data.items():
+ if key == 'サークル':
+ movie.producer = value[0]
+ elif key == '作者':
+ # 暂时没有在getchu找到多个actress的片子
+ movie.actress = [i.strip() for i in value]
+ elif key == '画像数&ページ数':
+ match = DURATION_PATTERN.search(' '.join(value))
+ if match:
+ movie.duration = match.group(1)
+ elif key == '配信開始日':
+ movie.publish_date = value[0].replace('/', '-')
+ elif key == '趣向':
+ movie.genre = value
+ elif key == '作品内容':
+                idx = len(value)  # 若没有以'※'开头的行,则保留全部简介
+ for i, line in enumerate(value):
+ if line.lstrip().startswith('※'):
+ idx = i
+ break
+ movie.plot = ''.join(value[:idx])
+
+ movie.title = get_movie_title(tree)
+ movie.cover = get_movie_img(tree, getchu_id)
+ movie.preview_pics = get_movie_preview(tree, getchu_id)
+ movie.dvdid = id_uc
+ movie.url = url
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await DlGetchuCrawler.create()
+ movie = MovieInfo('getchu-4041026')
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/fanza.py b/javsp/crawlers/sites/fanza.py
new file mode 100644
index 000000000..b81ac93ae
--- /dev/null
+++ b/javsp/crawlers/sites/fanza.py
@@ -0,0 +1,250 @@
+"""从fanza抓取数据"""
+
+import re
+import json
+import logging
+from typing import Dict, List, Tuple
+
+
+from javsp.crawlers.exceptions import MovieNotFoundError, SiteBlocked
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_session
+from javsp.config import Cfg
+from javsp.datatype import MovieInfo
+
+from lxml import html
+from lxml.html import HtmlElement
+from aiohttp import ClientResponse
+
+logger = logging.getLogger(__name__)
+
+
+_PRODUCT_PRIORITY = {'digital': 10, 'mono': 5, 'monthly': 2, 'rental': 1}
+_TYPE_PRIORITY = {'videoa': 10, 'anime': 8, 'nikkatsu': 6, 'doujin': 4, 'dvd': 3, 'ppr': 2, 'paradisetv': 1}
+def sort_search_result(result: List[Dict]):
+ """排序搜索结果"""
+ scores = {i['url']:(_PRODUCT_PRIORITY.get(i['product'], 0), _TYPE_PRIORITY.get(i['type'], 0)) for i in result}
+ sorted_result = sorted(result, key=lambda x:scores[x['url']], reverse=True)
+ return sorted_result
+
+
+async def resp2html_wrapper(resp: ClientResponse) -> HtmlElement:
+ tree = html.fromstring(await resp.text())
+ if 'not available in your region' in tree.text_content():
+ raise SiteBlocked('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置')
+ elif '/login/' in str(resp.url):
+ raise SiteBlocked('FANZA要求当前IP登录账号才可访问,请尝试更换为日本IP')
+ return tree
+
+
+
+
+def parse_anime_page(movie: MovieInfo, tree: HtmlElement):
+ """解析动画影片的页面布局"""
+ title = tree.xpath("//h1[@id='title']/text()")[0]
+ container = tree.xpath("//table[@class='mg-b12']/tr/td")[0]
+ cover = container.xpath("//img[@name='package-image']/@src")[0]
+ date_str = container.xpath("//td[text()='発売日:']/following-sibling::td/text()")[0].strip()
+ publish_date = date_str.replace('/', '-')
+ duration_tag = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")
+ if duration_tag:
+ movie.duration = duration_tag[0].strip().replace('分', '')
+ serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()")
+ if serial_tag:
+ movie.serial = serial_tag[0].strip()
+ producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()")
+ if producer_tag:
+ movie.producer = producer_tag[0].strip()
+ genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]")
+ genre, genre_id = [], []
+ for tag in genre_tags:
+ genre.append(tag.text.strip())
+ genre_id.append(tag.get('href').split('=')[-1].strip('/'))
+ cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip()
+ plot = container.xpath("//div[@class='mg-b20 lh4']/p")[0].text_content().strip()
+ preview_pics = container.xpath("//a[@name='sample-image']/img/@data-lazy")
+ score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0]
+ score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 50
+
+ movie.cid = cid
+ movie.title = title
+ movie.cover = cover
+ movie.publish_date = publish_date
+ movie.genre = genre
+ movie.genre_id = genre_id
+ movie.plot = plot
+ movie.score = f'{score/5:.2f}' # 转换为10分制
+ movie.preview_pics = preview_pics
+ movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
+
+
+# parse_dvd_page = parse_videoa_page # 118wtktabf067
+# parse_ppr_page = parse_videoa_page
+# parse_nikkatsu_page = parse_videoa_page
+# parse_doujin_page = parse_anime_page
+
+class FanzaCrawler(Crawler):
+ id = CrawlerID.fanza
+ headers: Dict[str, str]
+
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.dmm.co.jp')
+ self.base_url = str(url)
+ self.client = get_session(url)
+
+ # 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面)
+ self.client.cookie_jar.update_cookies({'age_check_done': '1'})
+ self.headers = {'Accept-Language': 'ja,en-US;q=0.9'}
+ return self
+
+    async def get_urls_of_cid(self, cid: str) -> List[Dict]:
+ """搜索cid可能的影片URL"""
+ r = await self.client.get(f"{self.base_url}search/?redirect=1&enc=UTF-8&category=&searchstr={cid}&commit.x=0&commit.y=0", headers=self.headers)
+ if r.status == 404:
+ raise MovieNotFoundError(__name__, cid)
+ r.raise_for_status()
+
+ tree = await resp2html_wrapper(r)
+ result = tree.xpath("//ul[@id='list']/li/div/p/a/@href")
+ parsed_result = {}
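+        # 搜索结果URL形如 .../{product}/{type}/-/detail/=/cid=xxx/,按'/'切分提取product、type与cid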
+ for url in result:
+ items = url.split('/')
+ type_, cid = None, None
+ for i, part in enumerate(items):
+ if part == '-':
+ product, type_ = items[i-2], items[i-1]
+ elif part.startswith('cid='):
+ cid = part[4:]
+ new_url = '/'.join(i for i in items if not i.startswith('?')) + '/'
+ parsed_result.setdefault(cid, []).append({'product': product, 'type': type_, 'url': new_url})
+ break
+ if cid not in parsed_result:
+ if len(result) > 0:
+ logger.debug(f"Unknown URL in search result: " + ', '.join(result))
+ raise MovieNotFoundError(__name__, cid)
+ sorted_result = sort_search_result(parsed_result[cid])
+ return sorted_result
+
+ async def dispatch(self, type: str, movie: MovieInfo, tree: HtmlElement):
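+        """根据影片页面的类型选择对应的解析函数"""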
+ match type:
+ case 'videoa' | 'dvd' | 'ppr' | 'nikkatsu':
+ await self.parse_videoa_page(movie, tree)
+ case 'anime' | 'doujin':
+ parse_anime_page(movie, tree)
+
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ default_url = f'{self.base_url}digital/videoa/-/detail/=/cid={movie.cid}/'
+ r0 = await self.client.get(default_url, headers=self.headers)
+ if r0.status == 404:
+ urls = await self.get_urls_of_cid(movie.cid)
+ for d in urls:
+ try:
+ r = await self.client.get(d['url'], headers=self.headers)
+ tree = await resp2html_wrapper(r)
+ await self.dispatch(d['type'], movie, tree)
+ movie.url = d['url']
+ break
+ except:
+ logger.debug(f"Fail to parse {d['url']}", exc_info=True)
+ if d is urls[-1]:
+ logger.warning(f"在fanza查找到的cid={movie.cid}的影片页面均解析失败")
+ raise
+ else:
+            tree = await resp2html_wrapper(r0)
+ await self.parse_videoa_page(movie, tree)
+ movie.url = default_url
+
+ async def parse_videoa_page(self, movie: MovieInfo, tree: HtmlElement):
+ """解析AV影片的页面布局"""
+ title = tree.xpath("//div[@class='hreview']/h1/text()")[0]
+ # 注意: 浏览器在渲染时会自动加上了'tbody'字段,但是原始html网页中并没有,因此xpath解析时还是要按原始网页的来
+ container = tree.xpath("//table[@class='mg-b12']/tr/td")[0]
+ cover = container.xpath("//div[@id='sample-video']/a/@href")[0]
+ # 采用'配信開始日'作为发布日期: https://www.zhihu.com/question/57513172/answer/153219083
+ date_tag = container.xpath("//td[text()='配信開始日:']/following-sibling::td/text()")
+ if date_tag:
+ movie.publish_date = date_tag[0].strip().replace('/', '-')
+ duration_str = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")[0].strip()
+ match = re.search(r'\d+', duration_str)
+ if match:
+ movie.duration = match.group(0)
+ # 女优、导演、系列:字段不存在时,匹配将得到空列表。暂未发现有名字不显示在a标签中的情况
+ actress = container.xpath("//span[@id='performer']/a/text()")
+ director_tag = container.xpath("//td[text()='監督:']/following-sibling::td/a/text()")
+ if director_tag:
+ movie.director = director_tag[0].strip()
+ serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()")
+ if serial_tag:
+ movie.serial = serial_tag[0].strip()
+ producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()")
+ if producer_tag:
+ movie.producer = producer_tag[0].strip()
+ # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到
+ # label_tag = container.xpath("//td[text()='レーベル:']/following-sibling::td/a/text()")
+ # if label_tag:
+ # label = label_tag[0].strip()
+ # fanza会把促销信息也写进genre……因此要根据tag指向的链接类型进行筛选
+ genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'?keyword=') or contains(@href,'article=keyword')]")
+ genre, genre_id = [], []
+ for tag in genre_tags:
+ genre.append(tag.text.strip())
+ genre_id.append(tag.get('href').split('=')[-1].strip('/'))
+ cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip()
+ plot = container.xpath("//div[contains(@class, 'mg-b20 lh4')]/text()")[0].strip()
+ preview_pics = container.xpath("//a[@name='sample-image']/img/@src")
+ score_tag = container.xpath("//p[@class='d-review__average']/strong/text()")
+ if score_tag:
+ match = re.search(r'\d+', score_tag[0].strip())
+ if match:
+ score = float(match.group()) * 2
+ movie.score = f'{score:.2f}'
+ else:
+            score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0]
+            score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 50
+            movie.score = f'{score/5:.2f}'  # 转换为10分制
+
+ if Cfg().crawler.hardworking:
+ # 预览视频是动态加载的,不在静态网页中
+ video_url = f'{self.base_url}service/digitalapi/-/html5_player/=/cid={movie.cid}'
+ resp = await self.client.get(video_url, headers=self.headers)
+ tree2 = html.fromstring(await resp.text())
+ # 目前用到js脚本的地方不多,所以不使用专门的js求值模块,先用正则提取文本然后用json解析数据
+ script = tree2.xpath("//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()")[0].strip()
+ match = re.search(r'\{.*\}', script)
+ # 主要是为了捕捉json.loads的异常,但是也借助try-except判断是否正则表达式是否匹配
+ try:
+ data = json.loads(match.group())
+ video_url = data.get('src')
+ if video_url and video_url.startswith('//'):
+ video_url = 'https:' + video_url
+ movie.preview_video = video_url
+ except Exception as e:
+ logger.debug('解析视频地址时异常: ' + repr(e))
+
+ movie.cid = cid
+ movie.title = title
+ movie.cover = cover
+ movie.actress = actress
+ movie.genre = genre
+ movie.genre_id = genre_id
+ movie.plot = plot
+ movie.preview_pics = preview_pics
+ movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await FanzaCrawler.create()
+ movie = MovieInfo(cid="d_aisoft3356")
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+
+ import asyncio
+ asyncio.run(test_main())
+
diff --git a/javsp/crawlers/sites/fc2.py b/javsp/crawlers/sites/fc2.py
new file mode 100644
index 000000000..01deffdab
--- /dev/null
+++ b/javsp/crawlers/sites/fc2.py
@@ -0,0 +1,121 @@
+"""从FC2官网抓取数据"""
+import logging
+
+from lxml import html
+
+
+from javsp.crawlers.exceptions import *
+from javsp.config import Cfg
+from javsp.lib import strftime_to_minutes
+from javsp.datatype import MovieInfo
+from javsp.crawlers.interface import Crawler
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_session
+from javsp.config import CrawlerID
+
+
+logger = logging.getLogger(__name__)
+
+class Fc2Crawler(Crawler):
+ id = CrawlerID.fc2
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://adult.contents.fc2.com')
+ self.base_url = str(url)
+ self.client = get_session(url)
+ return self
+
+ async def get_movie_score(self, fc2_id: str) -> float | None:
+ """通过评论数据来计算FC2的影片评分(10分制),无法获得评分时返回None"""
+ resp = await self.client.get(f'{self.base_url}/article/{fc2_id}/review')
+ tree = html.fromstring(await resp.text())
+ review_tags = tree.xpath("//ul[@class='items_comment_headerReviewInArea']/li")
+ reviews = {}
+ for tag in review_tags:
+ score = int(tag.xpath("div/span/text()")[0])
+ vote = int(tag.xpath("span")[0].text_content())
+ reviews[score] = vote
+ total_votes = sum(reviews.values())
+ if (total_votes >= 2): # 至少也该有两个人评价才有参考意义一点吧
+ summary = sum([k*v for k, v in reviews.items()])
+ final_score = summary / total_votes * 2 # 乘以2转换为10分制
+ return final_score
+
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ # 去除番号中的'FC2'字样
+ id_uc = movie.dvdid.upper()
+ if not id_uc.startswith('FC2-'):
+ raise ValueError('Invalid FC2 number: ' + movie.dvdid)
+ fc2_id = id_uc.replace('FC2-', '')
+ # 抓取网页
+ url = f'{self.base_url}/article/{fc2_id}/'
+ resp = await self.client.get(url)
+ if '/id.fc2.com/' in str(resp.url):
+ raise SiteBlocked('FC2要求当前IP登录账号才可访问,请尝试更换为日本IP')
+ tree = html.fromstring(await resp.text())
+ tree.make_links_absolute(base_url=self.base_url)
+ container = tree.xpath("//div[@class='items_article_left']")
+ if len(container) > 0:
+ container = container[0]
+ else:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ # FC2 标题增加反爬乱码,使用数组合并标题
+ title_arr = container.xpath("//div[@class='items_article_headerInfo']/h3/text()")
+ title = ''.join(title_arr)
+ thumb_tag = container.xpath("//div[@class='items_article_MainitemThumb']")[0]
+ thumb_pic = thumb_tag.xpath("span/img/@src")[0]
+ duration_str = thumb_tag.xpath("span/p[@class='items_article_info']/text()")[0]
+ # FC2没有制作商和发行商的区分,作为个人市场,影片页面的'by'更接近于制作商
+ producer = container.xpath("//li[text()='by ']/a/text()")[0]
+ genre = container.xpath("//a[@class='tag tagTag']/text()")
+ date_str = container.xpath("//div[@class='items_article_Releasedate']/p/text()")[0]
+ publish_date = date_str[-10:].replace('/', '-') # '販売日 : 2017/11/30'
+ preview_pics = container.xpath("//ul[@data-feed='sample-images']/li/a/@href")
+
+ if Cfg().crawler.hardworking:
+ # 通过评论数据来计算准确的评分
+ score = await self.get_movie_score(fc2_id)
+ if score:
+ movie.score = f'{score:.2f}'
+ # 预览视频是动态加载的,不在静态网页中
+ desc_frame_url = container.xpath("//section[@class='items_article_Contents']/iframe/@src")[0]
+ key = desc_frame_url.split('=')[-1] # /widget/article/718323/description?ac=60fc08fa...
+ api_url = f'{self.base_url}/api/v2/videos/{fc2_id}/sample?key={key}'
+ resp = await self.client.get(api_url)
+ j = await resp.json()
+ movie.preview_video = j['path']
+ else:
+ # 获取影片评分。影片页面的评分只能粗略到星级,且没有分数,要通过类名来判断,如'items_article_Star5'表示5星
+ score_tag_attr = container.xpath("//a[@class='items_article_Stars']/p/span/@class")[0]
+ score = int(score_tag_attr[-1]) * 2
+ movie.score = f'{score:.2f}'
+
+ movie.dvdid = id_uc
+ movie.url = url
+ movie.title = title
+ movie.genre = genre
+ movie.producer = producer
+ movie.duration = str(strftime_to_minutes(duration_str))
+ movie.publish_date = publish_date
+ movie.preview_pics = preview_pics
+ # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面
+ if movie.preview_pics:
+ movie.cover = preview_pics[0]
+ else:
+ movie.cover = thumb_pic
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await Fc2Crawler.create()
+ movie = MovieInfo("FC2-718323")
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/fc2ppvdb.py b/javsp/crawlers/sites/fc2ppvdb.py
new file mode 100644
index 000000000..8ae6d7415
--- /dev/null
+++ b/javsp/crawlers/sites/fc2ppvdb.py
@@ -0,0 +1,93 @@
+"""从FC2PPVDB抓取数据"""
+
+# BUG: This crawler doesn't work, seemed due to cloudflare
+
+from typing import List
+
+
+from javsp.crawlers.exceptions import *
+from javsp.lib import strftime_to_minutes
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_session
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from lxml import html
+
+
+class Fc2PpvDbCrawler(Crawler):
+ id = CrawlerID.fc2ppvdb
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://fc2ppvdb.com')
+ self.base_url = str(url)
+ self.client = get_session(url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+
+ def get_list_first(list: List):
+ return list[0] if list and len(list) > 0 else None
+
+ # 去除番号中的'FC2'字样
+ id_uc = movie.dvdid.upper()
+ if not id_uc.startswith('FC2-'):
+ raise ValueError('Invalid FC2 number: ' + movie.dvdid)
+ fc2_id = id_uc.replace('FC2-', '')
+ # 抓取网页
+ url = f'{self.base_url}/articles/{fc2_id}'
+ resp = await self.client.get(url)
+ tree = html.fromstring(await resp.text())
+ # html = get_html(url)
+ container = tree.xpath("//div[@class='container lg:px-5 px-2 py-12 mx-auto']/div[1]")
+ if len(container) > 0:
+ container = container[0]
+ else:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+
+ title = container.xpath("//h2/a/text()")
+ thumb_pic = container.xpath(f"//img[@alt='{fc2_id}']/@src")
+ duration_str = container.xpath("//div[starts-with(text(),'収録時間:')]/span/text()")
+ actress = container.xpath("//div[starts-with(text(),'女優:')]/span/a/text()")
+ genre = container.xpath("//div[starts-with(text(),'タグ:')]/span/a/text()")
+ publish_date = container.xpath("//div[starts-with(text(),'販売日:')]/span/text()")
+ publisher = container.xpath("//div[starts-with(text(),'販売者:')]/span/a/text()")
+ uncensored_str = container.xpath("//div[starts-with(text(),'モザイク:')]/span/text()")
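+        # 'モザイク: 無' 视为无码,'有' 视为有码,其余情况留空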
+        uncensored_str_f = get_list_first(uncensored_str)
+ uncensored = True if uncensored_str_f == '無' else False if uncensored_str_f == '有' else None
+ preview_pics = None
+ preview_video = container.xpath("//a[starts-with(text(),'サンプル動画')]/@href")
+
+ movie.dvdid = id_uc
+ movie.url = url
+ movie.title = get_list_first(title)
+ movie.genre = genre
+ movie.actress = actress
+ movie.duration = str(strftime_to_minutes(get_list_first(duration_str)))
+ movie.publish_date = get_list_first(publish_date)
+ movie.publisher = get_list_first(publisher)
+ movie.uncensored = uncensored
+ movie.preview_pics = preview_pics
+ movie.preview_video = get_list_first(preview_video)
+
+ # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面
+ if movie.preview_pics:
+ movie.cover = preview_pics[0]
+ else:
+ movie.cover = get_list_first(thumb_pic)
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await Fc2PpvDbCrawler.create()
+ movie = MovieInfo('FC2-4497837')
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/gyutto.py b/javsp/crawlers/sites/gyutto.py
new file mode 100644
index 000000000..8f294c8f2
--- /dev/null
+++ b/javsp/crawlers/sites/gyutto.py
@@ -0,0 +1,107 @@
+"""从https://gyutto.com/官网抓取数据"""
+import logging
+import time
+
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_session
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from lxml import html
+from lxml.html import HtmlElement
+
+logger = logging.getLogger(__name__)
+
+def get_movie_title(tree: HtmlElement) -> str | None:
+    container = tree.xpath("//h1")
+    if len(container) > 0:
+        return container[0].text
+    return None
+
+def get_movie_img(tree: HtmlElement, index = 1) -> list[str] | str | None:
+    """index为0时返回封面图片的地址,否则返回所有图片地址组成的列表"""
+    container = tree.xpath("//a[@class='highslide']/img")
+    if len(container) == 0:
+        return None if index == 0 else []
+    if index == 0:
+        return container[0].get('src')
+    return [row.get('src') for row in container]
+
+class GyuttoCrawler(Crawler):
+ id = CrawlerID.gyutto
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'http://gyutto.com')
+ self.base_url = str(url)
+ self.client = get_session(url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ # 去除番号中的'gyutto'字样
+ id_uc = movie.dvdid.upper()
+ if not id_uc.startswith('GYUTTO-'):
+ raise ValueError('Invalid gyutto number: ' + movie.dvdid)
+ gyutto_id = id_uc.replace('GYUTTO-', '')
+ # 抓取网页
+ url = f'{self.base_url}/i/item{gyutto_id}?select_uaflag=1'
+ r = await self.client.get(url)
+ if r.status == 404:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ tree = html.fromstring(await r.text())
+ tree.make_links_absolute(self.base_url)
+ container = tree.xpath("//dl[@class='BasicInfo clearfix']")
+
+ producer = None
+ genre = None
+ date = None
+ publish_date = None
+
+ for row in container:
+ key = row.xpath(".//dt/text()")
+ if key[0] == "サークル":
+ producer = ''.join(row.xpath(".//dd/a/text()"))
+ elif key[0] == "ジャンル":
+ genre = row.xpath(".//dd/a/text()")
+ elif key[0] == "配信開始日":
+ date = row.xpath(".//dd/text()")
+ date_str = ''.join(date)
+ date_time = time.strptime(date_str, "%Y年%m月%d日")
+ publish_date = time.strftime("%Y-%m-%d", date_time)
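+                # 例如 '2017年11月30日' 将被转换为 '2017-11-30'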
+
+ plot = tree.xpath("//div[@class='unit_DetailLead']/p/text()")[0]
+
+ movie.title = get_movie_title(tree)
+ movie.cover = get_movie_img(tree, 0)
+ movie.preview_pics = get_movie_img(tree)
+ movie.dvdid = id_uc
+ movie.url = url
+ movie.producer = producer
+ # movie.actress = actress
+ # movie.duration = duration
+ movie.publish_date = publish_date
+ movie.genre = genre
+ movie.plot = plot
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await GyuttoCrawler.create()
+ movie = MovieInfo('gyutto-266923')
+ try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/jav321.py b/javsp/crawlers/sites/jav321.py
new file mode 100644
index 000000000..6a50da46e
--- /dev/null
+++ b/javsp/crawlers/sites/jav321.py
@@ -0,0 +1,118 @@
+"""从jav321抓取数据"""
+import re
+import logging
+
+
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_session
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from lxml import html
+
+
+logger = logging.getLogger(__name__)
+
+class Jav321Crawler(Crawler):
+ id = CrawlerID.jav321
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.jav321.com')
+ self.base_url = str(url)
+ self.client = get_session(url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+
+ """解析指定番号的影片数据"""
+ resp = await self.client.post(f'{self.base_url}/search', data={'sn': movie.dvdid})
+ tree = html.fromstring(await resp.text())
+ tree.make_links_absolute(self.base_url)
+ page_url = tree.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0]
+ #TODO: 注意cid是dmm的概念。如果影片来自MGSTAGE,这里的cid很可能是jav321自己添加的,例如 345SIMM-542
+ cid = page_url.split('/')[-1] # /video/ipx00177
+ # 如果从URL匹配到的cid是'search',说明还停留在搜索页面,找不到这部影片
+ if cid == 'search':
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ title = tree.xpath("//div[@class='panel-heading']/h3/text()")[0]
+ info = tree.xpath("//div[@class='col-md-9']")[0]
+ # jav321的不同信息字段间没有明显分隔,只能通过url来匹配目标标签
+ company_tags = info.xpath("a[contains(@href,'/company/')]/text()")
+ if company_tags:
+ movie.producer = company_tags[0]
+ # actress, actress_pics
+ # jav321现在连女优信息都没有了,首页通过女优栏跳转过去也全是空白
+ actress, actress_pics = [], {}
+ actress_tags = tree.xpath("//div[@class='thumbnail']/a[contains(@href,'/star/')]/img")
+ for tag in actress_tags:
+ name = tag.tail.strip()
+ pic_url = tag.get('src')
+ actress.append(name)
+ # jav321的女优头像完全是应付了事:即使女优实际没有头像,也会有一个看起来像模像样的url,
+ # 因而无法通过url判断女优头像图片是否有效。有其他选择时最好不要使用jav321的女优头像数据
+ actress_pics[name] = pic_url
+ # genre, genre_id
+ genre_tags = info.xpath("a[contains(@href,'/genre/')]")
+ genre, genre_id = [], []
+ for tag in genre_tags:
+ genre.append(tag.text)
+ genre_id.append(tag.get('href').split('/')[-2]) # genre/4025/1
+ dvdid = info.xpath("b[text()='品番']")[0].tail.replace(': ', '').upper()
+ publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(': ', '')
+ duration_str = info.xpath("b[text()='収録時間']")[0].tail
+ match = re.search(r'\d+', duration_str)
+ if match:
+ movie.duration = match.group(0)
+ # 仅部分影片有评分且评分只能粗略到星级而没有分数,要通过星级的图片来判断,如'/img/35.gif'表示3.5星
+ score_tag = info.xpath("//b[text()='平均評価']/following-sibling::img/@data-original")
+ if score_tag:
+            score = int(score_tag[0][5:7])/5  # 例如'/img/35.gif'取到'35',除以5等价于先得3.5星、再乘2换算为10分制的7.0
+ movie.score = str(score)
+ serial_tag = info.xpath("a[contains(@href,'/series/')]/text()")
+ if serial_tag:
+ movie.serial = serial_tag[0]
+ preview_video_tag = info.xpath("//video/source/@src")
+ if preview_video_tag:
+ movie.preview_video = preview_video_tag[0]
+ plot_tag = info.xpath("//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()")
+ if plot_tag:
+ movie.plot = plot_tag[0]
+ preview_pics = tree.xpath("//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src")
+ if len(preview_pics) == 0:
+ # 尝试搜索另一种布局下的封面,需要使用onerror过滤掉明明没有封面时网站往里面塞的默认URL
+ preview_pics = tree.xpath("//div/div/div[@class='col-md-3']/img[@onerror and @class='img-responsive']/@src")
+ # 有的图片链接里有多个//,网站质量堪忧……
+ preview_pics = [i[:8] + i[8:].replace('//', '/') for i in preview_pics]
+ # 磁力和ed2k链接是依赖js脚本加载的,无法通过静态网页来解析
+
+ movie.url = page_url
+ movie.cid = cid
+ movie.dvdid = dvdid
+ movie.title = title
+ movie.actress = actress
+ movie.actress_pics = actress_pics
+ movie.genre = genre
+ movie.genre_id = genre_id
+ movie.publish_date = publish_date
+ # preview_pics的第一张图始终是封面,剩下的才是预览图
+ if len(preview_pics) > 0:
+ movie.cover = preview_pics[0]
+ movie.preview_pics = preview_pics[1:]
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await Jav321Crawler.create()
+ movie = MovieInfo('SCUTE-1177')
+ try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/javbus.py b/javsp/crawlers/sites/javbus.py
new file mode 100644
index 000000000..07d085563
--- /dev/null
+++ b/javsp/crawlers/sites/javbus.py
@@ -0,0 +1,132 @@
+"""从JavBus抓取数据"""
+import logging
+
+
+from javsp.crawlers.exceptions import *
+from javsp.func import *
+from javsp.config import CrawlerID
+from javsp.datatype import MovieInfo, GenreMap
+
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_session
+
+from javsp.crawlers.interface import Crawler
+from lxml import html
+
+
+logger = logging.getLogger(__name__)
+
+class JavbusCrawler(Crawler):
+ id = CrawlerID.javbus
+ genre_map: GenreMap
+ perma_url: str
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ self.perma_url = 'https://www.javbus.com'
+ url = await resolve_site_fallback(self.id, self.perma_url)
+ self.base_url = str(url)
+ self.client = get_session(url)
+ self.client.cookie_jar.update_cookies({'age': 'verified', 'dv': '1'})
+ self.genre_map = GenreMap('data/genre_javbus.csv')
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+
+ """从网页抓取并解析指定番号的数据
+ Args:
+ movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
+ """
+ url = f'{self.base_url}/{movie.dvdid}'
+ resp = await self.client.get(url)
+
+ tree = html.fromstring(await resp.text())
+ tree.make_links_absolute(base_url=self.perma_url)
+ # 疑似JavBus检测到类似爬虫的行为时会要求登录,不过发现目前不需要登录也可以从重定向前的网页中提取信息
+ # 引入登录验证后状态码不再准确,因此还要额外通过检测标题来确认是否发生了404
+ page_title = tree.xpath('/html/head/title/text()')
+ if page_title and page_title[0].startswith('404 Page Not Found!'):
+ raise MovieNotFoundError(__name__, movie.dvdid)
+
+ container = tree.xpath("//div[@class='container']")[0]
+ title = container.xpath("h3/text()")[0]
+ cover = container.xpath("//a[@class='bigImage']/img/@src")[0]
+ preview_pics = container.xpath("//div[@id='sample-waterfall']/a/@href")
+ info = container.xpath("//div[@class='col-md-3 info']")[0]
+ dvdid = info.xpath("p/span[text()='識別碼:']")[0].getnext().text
+ publish_date = info.xpath("p/span[text()='發行日期:']")[0].tail.strip()
+ duration = info.xpath("p/span[text()='長度:']")[0].tail.replace('分鐘', '').strip()
+ director_tag = info.xpath("p/span[text()='導演:']")
+ if director_tag: # xpath没有匹配时将得到空列表
+ movie.director = director_tag[0].getnext().text.strip()
+ producer_tag = info.xpath("p/span[text()='製作商:']")
+ if producer_tag:
+ text = producer_tag[0].getnext().text
+ if text:
+ movie.producer = text.strip()
+ publisher_tag = info.xpath("p/span[text()='發行商:']")
+ if publisher_tag:
+ movie.publisher = publisher_tag[0].getnext().text.strip()
+ serial_tag = info.xpath("p/span[text()='系列:']")
+ if serial_tag:
+ movie.serial = serial_tag[0].getnext().text
+ # genre, genre_id
+ genre_tags = info.xpath("//span[@class='genre']/label/a")
+ genre, genre_id = [], []
+ for tag in genre_tags:
+ tag_url = tag.get('href')
+ pre_id = tag_url.split('/')[-1]
+ genre.append(tag.text)
+ if 'uncensored' in tag_url:
+ movie.uncensored = True
+ genre_id.append('uncensored-' + pre_id)
+ else:
+ movie.uncensored = False
+ genre_id.append(pre_id)
+ # JavBus的磁力链接是依赖js脚本加载的,无法通过静态网页来解析
+ # actress, actress_pics
+ actress, actress_pics = [], {}
+ actress_tags = tree.xpath("//a[@class='avatar-box']/div/img")
+ for tag in actress_tags:
+ name = tag.get('title')
+ pic_url = tag.get('src')
+ actress.append(name)
+ if not pic_url.endswith('nowprinting.gif'): # 略过默认的头像
+ actress_pics[name] = pic_url
+ # 整理数据并更新movie的相应属性
+ movie.url = f'{self.perma_url}/{movie.dvdid}'
+ movie.dvdid = dvdid
+ movie.title = title.replace(dvdid, '').strip()
+ movie.cover = cover
+ movie.preview_pics = preview_pics
+ if publish_date != '0000-00-00': # 丢弃无效的发布日期
+ movie.publish_date = publish_date
+ movie.duration = duration if int(duration) else None
+ movie.genre = genre
+ movie.genre_id = genre_id
+ movie.actress = actress
+ movie.actress_pics = actress_pics
+
+ async def crawl_and_fill_cleaned(self, movie: MovieInfo):
+ """解析指定番号的影片数据并进行清洗"""
+ await self.crawl_and_fill(movie)
+ movie.genre_norm = self.genre_map.map(movie.genre_id)
+ movie.genre_id = None # 没有别的地方需要再用到,清空genre id(暗示已经完成转换)
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await JavbusCrawler.create()
+ print(crawler.client.headers)
+ movie = MovieInfo('NANP-030')
+ # try:
+ await crawler.crawl_and_fill_cleaned(movie)
+ print(movie)
+ # except Exception as e:
+ # print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/javdb.py b/javsp/crawlers/sites/javdb.py
new file mode 100644
index 000000000..dafa010d1
--- /dev/null
+++ b/javsp/crawlers/sites/javdb.py
@@ -0,0 +1,355 @@
+"""从JavDB抓取数据"""
+import os
+import re
+import logging
+from typing import Dict
+
+from javsp.func import *
+from javsp.avid import guess_av_type
+from javsp.config import CrawlerID
+from javsp.datatype import MovieInfo, GenreMap
+from javsp.chromium import get_browsers_cookies
+
+from javsp.crawlers.exceptions import CredentialError, MovieDuplicateError, MovieNotFoundError, SiteBlocked, SitePermissionError, WebsiteError
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_session
+
+from javsp.crawlers.interface import Crawler
+from lxml import html
+
+logger = logging.getLogger(__name__)
+perma_url = 'https://www.javdb.com'
+
+class JavDbCrawler(Crawler):
+ id = CrawlerID.javdb
+ genre_map: GenreMap
+ cookies_pool: list
+ headers: Dict[str, str]
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, perma_url)
+ self.base_url = str(url)
+ self.client = get_session(url)
+ self.headers = {'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5'}
+ self.genre_map = GenreMap('data/genre_javdb.csv')
+ self.cookies_pool = []
+ return self
+
+ async def get_html_wrapper(self, url: str):
+ """包装外发的request请求并负责转换为可xpath的html,同时处理Cookies无效等问题"""
+
+ r = await self.client.get(url)
+ if r.status == 200:
+ # 发生重定向可能仅仅是域名重定向,因此还要检查url以判断是否被跳转到了登录页
+ if r.history and '/login' in str(r.url):
+ # 仅在需要时去读取Cookies
+ if len(self.cookies_pool) == 0:
+ try:
+ self.cookies_pool = get_browsers_cookies()
+ except (PermissionError, OSError) as e:
+ logger.warning(f"无法从浏览器Cookies文件获取JavDB的登录凭据({e}),可能是安全软件在保护浏览器Cookies文件", exc_info=True)
+ self.cookies_pool = []
+ except Exception as e:
+ logger.warning(f"获取JavDB的登录凭据时出错({e}),你可能使用的是国内定制版等非官方Chrome系浏览器", exc_info=True)
+ self.cookies_pool = []
+ if len(self.cookies_pool) > 0:
+ item = self.cookies_pool.pop()
+ # 更换Cookies时需要创建新的request实例,否则cloudscraper会保留它内部第一次发起网络访问时获得的Cookies
+                    self.client.cookie_jar.update_cookies(item['cookies'])
+ cookies_source = (item['profile'], item['site'])
+ logger.debug(f'未携带有效Cookies而发生重定向,尝试更换Cookies为: {cookies_source}')
+                    return await self.get_html_wrapper(url)
+ else:
+ raise CredentialError('JavDB: 所有浏览器Cookies均已过期')
+ elif r.history and 'pay' in str(r.url).split('/')[-1]:
+ raise SitePermissionError(f"JavDB: 此资源被限制为仅VIP可见: '{r.history[0].url}'")
+ else:
+
+ tree = html.fromstring(await r.text())
+ tree.make_links_absolute(base_url=perma_url)
+ return tree
+ elif r.status in (403, 503):
+ tree = html.fromstring(await r.text())
+ code_tag = tree.xpath("//span[@class='code-label']/span")
+ error_code = code_tag[0].text if code_tag else None
+ if error_code:
+ if error_code == '1020':
+ block_msg = f'JavDB: {r.status} 禁止访问: 站点屏蔽了来自日本地区的IP地址,请使用其他地区的代理服务器'
+ else:
+ block_msg = f'JavDB: {r.status} 禁止访问: {url} (Error code: {error_code})'
+ else:
+ block_msg = f'JavDB: {r.status} 禁止访问: {url}'
+ raise SiteBlocked(block_msg)
+ else:
+ raise WebsiteError(f'JavDB: {r.status} 非预期状态码: {url}')
+
+
+ async def get_user_info(self, site: str, cookies):
+ """获取cookies对应的JavDB用户信息"""
+ try:
+            self.client.cookie_jar.update_cookies(cookies)
+ resp = await self.client.get(f'https://{site}/users/profile')
+
+ html_str = await resp.text()
+ tree = html.fromstring(html_str)
+ except Exception as e:
+ logger.info('JavDB: 获取用户信息时出错')
+ logger.debug(e, exc_info=1)
+ return
+ # 扫描浏览器得到的Cookies对应的临时域名可能会过期,因此需要先判断域名是否仍然指向JavDB的站点
+ if 'JavDB' in html_str:
+ email = tree.xpath("//div[@class='user-profile']/ul/li[1]/span/following-sibling::text()")[0].strip()
+ username = tree.xpath("//div[@class='user-profile']/ul/li[2]/span/following-sibling::text()")[0].strip()
+ return email, username
+ else:
+ logger.debug('JavDB: 域名已过期: ' + site)
+
+
+ async def get_valid_cookies(self):
+ """扫描浏览器,获取一个可用的Cookies"""
+ # 经测试,Cookies所发往的域名不需要和登录时的域名保持一致,只要Cookies有效即可在多个域名间使用
+ for d in self.cookies_pool:
+ info = await self.get_user_info(d['site'], d['cookies'])
+ if info:
+ return d['cookies']
+ else:
+ logger.debug(f"{d['profile']}, {d['site']}: Cookies无效")
+
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """从网页抓取并解析指定番号的数据
+ Args:
+ movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
+ """
+ # JavDB搜索番号时会有多个搜索结果,从中查找匹配番号的那个
+ tree = await self.get_html_wrapper(f'{self.base_url}/search?q={movie.dvdid}')
+ ids = list(map(str.lower, tree.xpath("//div[@class='video-title']/strong/text()")))
+ movie_urls = tree.xpath("//a[@class='box']/@href")
+ match_count = len([i for i in ids if i == movie.dvdid.lower()])
+ if match_count == 0:
+ raise MovieNotFoundError(__name__, movie.dvdid, ids)
+ elif match_count == 1:
+ index = ids.index(movie.dvdid.lower())
+ new_url = movie_urls[index]
+ try:
+ html2 = await self.get_html_wrapper(new_url)
+ except (SitePermissionError, CredentialError):
+ # 不开VIP不让看,过分。决定榨出能获得的信息,毕竟有时候只有这里能找到标题和封面
+ box = tree.xpath("//a[@class='box']")[index]
+ movie.url = new_url
+ movie.title = box.get('title')
+ movie.cover = box.xpath("div/img/@src")[0]
+ score_str = box.xpath("div[@class='score']/span/span")[0].tail
+ score = re.search(r'([\d.]+)分', score_str).group(1)
+ movie.score = "{:.2f}".format(float(score)*2)
+ movie.publish_date = box.xpath("div[@class='meta']/text()")[0].strip()
+ return
+ else:
+ raise MovieDuplicateError(__name__, movie.dvdid, match_count)
+
+ container = html2.xpath("/html/body/section/div/div[@class='video-detail']")[0]
+ info = container.xpath("//nav[@class='panel movie-panel-info']")[0]
+ title = container.xpath("h2/strong[@class='current-title']/text()")[0]
+ show_orig_title = container.xpath("//a[contains(@class, 'meta-link') and not(contains(@style, 'display: none'))]")
+ if show_orig_title:
+ movie.ori_title = container.xpath("h2/span[@class='origin-title']/text()")[0]
+ cover = container.xpath("//img[@class='video-cover']/@src")[0]
+ preview_pics = container.xpath("//a[@class='tile-item'][@data-fancybox='gallery']/@href")
+ preview_video_tag = container.xpath("//video[@id='preview-video']/source/@src")
+ if preview_video_tag:
+ preview_video = preview_video_tag[0]
+ if preview_video.startswith('//'):
+ preview_video = 'https:' + preview_video
+ movie.preview_video = preview_video
+ dvdid = info.xpath("div/span")[0].text_content()
+ publish_date = info.xpath("div/strong[text()='日期:']")[0].getnext().text
+ duration = info.xpath("div/strong[text()='時長:']")[0].getnext().text.replace('分鍾', '').strip()
+ director_tag = info.xpath("div/strong[text()='導演:']")
+ if director_tag:
+ movie.director = director_tag[0].getnext().text_content().strip()
+ av_type = guess_av_type(movie.dvdid)
+ if av_type != 'fc2':
+ producer_tag = info.xpath("div/strong[text()='片商:']")
+ else:
+ producer_tag = info.xpath("div/strong[text()='賣家:']")
+ if producer_tag:
+ movie.producer = producer_tag[0].getnext().text_content().strip()
+ publisher_tag = info.xpath("div/strong[text()='發行:']")
+ if publisher_tag:
+ movie.publisher = publisher_tag[0].getnext().text_content().strip()
+ serial_tag = info.xpath("div/strong[text()='系列:']")
+ if serial_tag:
+ movie.serial = serial_tag[0].getnext().text_content().strip()
+ score_tag = info.xpath("//span[@class='score-stars']")
+ if score_tag:
+ score_str = score_tag[0].tail
+ score = re.search(r'([\d.]+)分', score_str).group(1)
+ movie.score = "{:.2f}".format(float(score)*2)
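+            # 例如页面评分显示 '4.5分' 时,换算为10分制得到 '9.00'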
+ genre_tags = info.xpath("//strong[text()='類別:']/../span/a")
+ genre, genre_id = [], []
+ for tag in genre_tags:
+ pre_id = tag.get('href').split('/')[-1]
+ genre.append(tag.text)
+ genre_id.append(pre_id)
+ # 判定影片有码/无码
+ subsite = pre_id.split('?')[0]
+ movie.uncensored = {'uncensored': True, 'tags':False}.get(subsite)
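+            # 即 genre 链接的子路径为 'uncensored' 时判定为无码,'tags' 为有码,其余情况留空(None)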
+ # JavDB目前同时提供男女优信息,根据用来标识性别的符号筛选出女优
+ actors_tag = info.xpath("//strong[text()='演員:']/../span")[0]
+ all_actors = actors_tag.xpath("a/text()")
+ genders = actors_tag.xpath("strong/text()")
+        actress = [name for name, gender in zip(all_actors, genders) if gender == '♀']
+ magnet = container.xpath("//div[@class='magnet-name column is-four-fifths']/a/@href")
+
+ movie.dvdid = dvdid
+        movie.url = new_url
+ movie.title = title.replace(dvdid, '').strip()
+ movie.cover = cover
+ movie.preview_pics = preview_pics
+ movie.publish_date = publish_date
+ movie.duration = duration
+ movie.genre = genre
+ movie.genre_id = genre_id
+ movie.actress = actress
+ movie.magnet = [i.replace('[javdb.com]','') for i in magnet]
+
+
+ async def crawl_and_fill_cleaned(self, movie: MovieInfo):
+ """解析指定番号的影片数据并进行清洗"""
+ try:
+ await self.crawl_and_fill(movie)
+ # 检查封面URL是否真的存在对应图片
+ if movie.cover is not None:
+ r = await self.client.head(movie.cover)
+ if r.status != 200:
+ movie.cover = None
+ except SiteBlocked:
+            logger.error('JavDB: 可能触发了反爬虫机制,请稍后再试')
+            raise
+ if movie.genre_id and (not movie.genre_id[0].startswith('fc2?')):
+ movie.genre_norm = self.genre_map.map(movie.genre_id)
+ movie.genre_id = None # 没有别的地方需要再用到,清空genre id(表明已经完成转换)
+
+
+ async def collect_actress_alias(self, type=0, use_original=True):
+ """
+ 收集女优的别名
+ type: 0-有码, 1-无码, 2-欧美
+ use_original: 是否使用原名而非译名,True-田中レモン,False-田中檸檬
+ """
+ import json
+ import time
+ import random
+
+ actressAliasMap = {}
+
+ actressAliasFilePath = "data/actress_alias.json"
+ # 检查文件是否存在
+ if not os.path.exists(actressAliasFilePath):
+ # 如果文件不存在,创建文件并写入空字典
+ with open(actressAliasFilePath, "w", encoding="utf-8") as file:
+ json.dump({}, file)
+
+ typeList = ["censored", "uncensored", "western"]
+ page_url = f"{self.base_url}/actors/{typeList[type]}"
+ while True:
+ try:
+ tree = await self.get_html_wrapper(page_url)
+ actors = tree.xpath("//div[@class='box actor-box']/a")
+
+ count = 0
+ for actor in actors:
+ count += 1
+ actor_name = actor.xpath("strong/text()")[0].strip()
+ actor_url = actor.xpath("@href")[0]
+ actor_url = self.base_url + actor_url # 构造演员主页的完整URL
+
+ # 进入演员主页,获取更多信息
+ actor_html = await self.get_html_wrapper(actor_url)
+ # 解析演员所有名字信息
+ names_span = actor_html.xpath("//span[@class='actor-section-name']")[0]
+ aliases_span_list = actor_html.xpath("//span[@class='section-meta']")
+ aliases_span = aliases_span_list[0]
+
+ names_list = [name.strip() for name in names_span.text.split(",")]
+ if len(aliases_span_list) > 1:
+ aliases_list = [
+ alias.strip() for alias in aliases_span.text.split(",")
+ ]
+ else:
+ aliases_list = []
+
+ # 将信息添加到actressAliasMap中
+ actressAliasMap[names_list[-1 if use_original else 0]] = (
+ names_list + aliases_list
+ )
+ print(
+ f"{count} --- {names_list[-1 if use_original else 0]}: {names_list + aliases_list}"
+ )
+
+ if count == 10:
+ # 将数据写回文件
+ with open(actressAliasFilePath, "r", encoding="utf-8") as file:
+ existing_data = json.load(file)
+
+ # 合并现有数据和新爬取的数据
+ existing_data.update(actressAliasMap)
+
+ # 将合并后的数据写回文件
+ with open(actressAliasFilePath, "w", encoding="utf-8") as file:
+ json.dump(existing_data, file, ensure_ascii=False, indent=2)
+
+ actressAliasMap = {} # 重置actressAliasMap
+
+ print(
+ f"已爬取 {count} 个女优,数据已更新并写回文件:",
+ actressAliasFilePath,
+ )
+
+ # 重置计数器
+ count = 0
+
+ time.sleep(max(1, 10 * random.random())) # 随机等待 1-10 秒
+
+ # 判断是否有下一页按钮
+ next_page_link = tree.xpath(
+ "//a[@rel='next' and @class='pagination-next']/@href"
+ )
+ if not next_page_link:
+ break # 没有下一页,结束循环
+ else:
+ next_page_url = f"{next_page_link[0]}"
+ page_url = next_page_url
+
+ except SiteBlocked:
+ raise
+
+ with open(actressAliasFilePath, "r", encoding="utf-8") as file:
+ existing_data = json.load(file)
+
+ # 合并现有数据和新爬取的数据
+ existing_data.update(actressAliasMap)
+
+ # 将合并后的数据写回文件
+ with open(actressAliasFilePath, "w", encoding="utf-8") as file:
+ json.dump(existing_data, file, ensure_ascii=False, indent=2)
+
+ print(f"已爬取 {count} 个女优,数据已更新并写回文件:", actressAliasFilePath)
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ # breakpoint()
+ crawler = await JavDbCrawler.create()
+ movie = MovieInfo('FC2-2735981')
+ try:
+ await crawler.crawl_and_fill_cleaned(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/javlib.py b/javsp/crawlers/sites/javlib.py
new file mode 100644
index 000000000..3832acbea
--- /dev/null
+++ b/javsp/crawlers/sites/javlib.py
@@ -0,0 +1,118 @@
+"""从JavLibrary抓取数据"""
+
+# BUG: This crawler doesn't work, seemed due to cloudflare
+
+import logging
+from urllib.parse import urlsplit
+
+from javsp.crawlers.exceptions import MovieDuplicateError, MovieNotFoundError
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_session
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from lxml import html
+
+logger = logging.getLogger(__name__)
+
+class JavLibCrawler(Crawler):
+ id = CrawlerID.javlib
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.javlibrary.com')
+ self.base_url = str(url)
+ self.client = get_session(url)
+ return self
+
+ # TODO: 发现JavLibrary支持使用cid搜索,会直接跳转到对应的影片页面,也许可以利用这个功能来做cid到dvdid的转换
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ url = new_url = f'{self.base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}'
+ resp = await self.client.get(url)
+ tree = html.fromstring(await resp.text())
+ if resp.history and urlsplit(str(resp.url)).netloc == urlsplit(self.base_url).netloc:
+            # 发生重定向且新旧地址的netloc相同时,说明搜索到了影片且只有一个结果
+            new_url = str(resp.url)
+ else: # 如果有多个搜索结果则不会自动跳转,此时需要程序介入选择搜索结果
+ video_tags = tree.xpath("//div[@class='video'][@id]/a")
+ # 通常第一部影片就是我们要找的,但是以免万一还是遍历所有搜索结果
+ pre_choose = []
+ for tag in video_tags:
+ tag_dvdid = tag.xpath("div[@class='id']/text()")[0]
+ if tag_dvdid.upper() == movie.dvdid.upper():
+ pre_choose.append(tag)
+ pre_choose_urls = [i.get('href') for i in pre_choose]
+ match_count = len(pre_choose)
+ if match_count == 0:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ elif match_count == 1:
+ new_url = pre_choose_urls[0]
+ elif match_count == 2:
+ no_blueray = []
+ for tag in pre_choose:
+ if 'ブルーレイディスク' not in tag.get('title'): # Blu-ray Disc
+ no_blueray.append(tag)
+ no_blueray_count = len(no_blueray)
+ if no_blueray_count == 1:
+ new_url = no_blueray[0].get('href')
+ logger.debug(f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果,已自动选择封面比例正确的一个: {new_url}")
+ else:
+ # 两个结果中没有谁是蓝光影片,说明影片番号重复了
+ raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls)
+ else:
+ # 存在不同影片但是番号相同的情况,如MIDV-010
+ raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls)
+ # 重新抓取网页
+ resp = await self.client.get(new_url)
+ tree = html.fromstring(await resp.text())
+ container = tree.xpath("/html/body/div/div[@id='rightcolumn']")[0]
+ title_tag = container.xpath("div/h3/a/text()")
+ title = title_tag[0]
+ cover = container.xpath("//img[@id='video_jacket_img']/@src")[0]
+ info = container.xpath("//div[@id='video_info']")[0]
+ dvdid = info.xpath("div[@id='video_id']//td[@class='text']/text()")[0]
+ publish_date = info.xpath("div[@id='video_date']//td[@class='text']/text()")[0]
+ duration = info.xpath("div[@id='video_length']//span[@class='text']/text()")[0]
+ director_tag = info.xpath("//span[@class='director']/a/text()")
+ if director_tag:
+ movie.director = director_tag[0]
+ producer = info.xpath("//span[@class='maker']/a/text()")[0]
+ publisher_tag = info.xpath("//span[@class='label']/a/text()")
+ if publisher_tag:
+ movie.publisher = publisher_tag[0]
+ score_tag = info.xpath("//span[@class='score']/text()")
+ if score_tag:
+ movie.score = score_tag[0].strip('()')
+ genre = info.xpath("//span[@class='genre']/a/text()")
+ actress = info.xpath("//span[@class='star']/a/text()")
+
+ movie.dvdid = dvdid
+ movie.url = new_url
+ movie.title = title.replace(dvdid, '').strip()
+ if cover.startswith('//'): # 补全URL中缺少的协议段
+ cover = 'https:' + cover
+ movie.cover = cover
+ movie.publish_date = publish_date
+ movie.duration = duration
+ movie.producer = producer
+ movie.genre = genre
+ movie.actress = actress
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await JavLibCrawler.create()
+ movie = MovieInfo('IPX-177')
+ try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/javmenu.py b/javsp/crawlers/sites/javmenu.py
new file mode 100644
index 000000000..15ea78c0c
--- /dev/null
+++ b/javsp/crawlers/sites/javmenu.py
@@ -0,0 +1,100 @@
+"""从JavMenu抓取数据"""
+import logging
+
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_session
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from lxml import html
+
+logger = logging.getLogger(__name__)
+
+class JavMenuCrawler(Crawler):
+ id = CrawlerID.javmenu
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.javmenu.com')
+ self.base_url = str(url)
+ self.client = get_session(url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """从网页抓取并解析指定番号的数据
+ Args:
+ movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
+ """
+ # JavMenu网页做得很不走心,将就了
+ url = f'{self.base_url}zh/{movie.dvdid}'
+ r = await self.client.get(url)
+ if r.history:
+ # 被重定向到主页说明找不到影片资源
+ raise MovieNotFoundError(__name__, movie.dvdid)
+
+ tree = html.fromstring(await r.text())
+ container = tree.xpath("//div[@class='col-md-9 px-0']")[0]
+ title = container.xpath("div[@class='col-12 mb-3']/h1/strong/text()")[0]
+ # 竟然还在标题里插广告,真的疯了。要不是我已经写了抓取器,才懒得维护这个破站
+ title = title.replace(' | JAV目錄大全 | 每日更新', '')
+ title = title.replace(' 免費在線看', '').replace(' 免費AV在線看', '')
+ cover_tag = container.xpath("//div[@class='single-video']")
+ if len(cover_tag) > 0:
+ video_tag = cover_tag[0].find('video')
+ # URL首尾竟然也有空格……
+ movie.cover = video_tag.get('data-poster').strip()
+ # 预览影片改为blob了,无法获取
+ # movie.preview_video = video_tag.find('source').get('src').strip()
+ else:
+ cover_img_tag = container.xpath("//img[@class='lazy rounded']/@data-src")
+ if cover_img_tag:
+ movie.cover = cover_img_tag[0].strip()
+ info = container.xpath("//div[@class='card-body']")[0]
+ publish_date = info.xpath("div/span[contains(text(), '日期:')]")[0].getnext().text
+ duration = info.xpath("div/span[contains(text(), '时长:')]")[0].getnext().text.replace('分钟', '')
+ producer = info.xpath("div/span[contains(text(), '製作:')]/following-sibling::a/span/text()")
+ if producer:
+ movie.producer = producer[0]
+ genre_tags = info.xpath("//a[@class='genre']")
+ genre, genre_id = [], []
+ for tag in genre_tags:
+ items = tag.get('href').split('/')
+ pre_id = items[-3] + '/' + items[-1]
+ genre.append(tag.text.strip())
+ genre_id.append(pre_id)
+ # genre的链接中含有censored字段,但是无法用来判断影片是否有码,因为完全不可靠……
+ actress = info.xpath("div/span[contains(text(), '女优:')]/following-sibling::*/a/text()") or None
+ magnet_table = container.xpath("//table[contains(@class, 'magnet-table')]/tbody")
+ if magnet_table:
+ magnet_links = magnet_table[0].xpath("tr/td/a/@href")
+ # 它的FC2数据是从JavDB抓的,JavDB更换图片服务器后它也跟上了,似乎数据更新频率还可以
+ movie.magnet = [i.replace('[javdb.com]','') for i in magnet_links]
+ preview_pics = container.xpath("//a[@data-fancybox='gallery']/@href")
+
+ if (not movie.cover) and preview_pics:
+ movie.cover = preview_pics[0]
+ movie.url = url
+ movie.title = title.replace(movie.dvdid, '').strip()
+ movie.preview_pics = preview_pics
+ movie.publish_date = publish_date
+ movie.duration = duration
+ movie.genre = genre
+ movie.genre_id = genre_id
+ movie.actress = actress
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await JavMenuCrawler.create()
+ movie = MovieInfo('FC2-718323')
+ # try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ # except Exception as e:
+ # print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/mgstage.py b/javsp/crawlers/sites/mgstage.py
new file mode 100644
index 000000000..d21d5e3af
--- /dev/null
+++ b/javsp/crawlers/sites/mgstage.py
@@ -0,0 +1,127 @@
+"""从蚊香社-mgstage抓取数据"""
+import re
+import logging
+
+
+from javsp.crawlers.exceptions import MovieNotFoundError, SiteBlocked
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_session
+from javsp.crawlers.interface import Crawler
+from javsp.config import Cfg, CrawlerID
+from lxml import html
+
+
+logger = logging.getLogger(__name__)
+
+class MgstageCrawler(Crawler):
+ id = CrawlerID.mgstage
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.mgstage.com')
+ self.base_url = str(url)
+ self.client = get_session(url)
+ # 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面)
+ self.client.cookie_jar.update_cookies({'adc': '1'})
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ url = f'{self.base_url}product/product_detail/{movie.dvdid}/'
+ resp = await self.client.get(url)
+ if resp.status == 403:
+ raise SiteBlocked('mgstage不允许从当前IP所在地区访问,请尝试更换为日本地区代理')
+ # url不存在时会被重定向至主页。history非空时说明发生了重定向
+ elif resp.history:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+
+ tree = html.fromstring(await resp.text())
+ # mgstage的文本中含有大量的空白字符('\n \t'),需要使用strip去除
+ title = tree.xpath("//div[@class='common_detail_cover']/h1/text()")[0].strip()
+ container = tree.xpath("//div[@class='detail_left']")[0]
+ cover = container.xpath("//a[@id='EnlargeImage']/@href")[0]
+ # 有链接的女优和仅有文本的女优匹配方法不同,因此分别匹配以后合并列表
+ actress_text = container.xpath("//th[text()='出演:']/following-sibling::td/text()")
+ actress_link = container.xpath("//th[text()='出演:']/following-sibling::td/a/text()")
+ actress = [i.strip() for i in actress_text + actress_link]
+ actress = [i for i in actress if i] # 移除空字符串
+ producer = container.xpath("//th[text()='メーカー:']/following-sibling::td/a/text()")[0].strip()
+ duration_str = container.xpath("//th[text()='収録時間:']/following-sibling::td/text()")[0]
+ match = re.search(r'\d+', duration_str)
+ if match:
+ movie.duration = match.group(0)
+ dvdid = container.xpath("//th[text()='品番:']/following-sibling::td/text()")[0]
+ date_str = container.xpath("//th[text()='配信開始日:']/following-sibling::td/text()")[0]
+ publish_date = date_str.replace('/', '-')
+ serial_tag = container.xpath("//th[text()='シリーズ:']/following-sibling::td/a/text()")
+ if serial_tag:
+ movie.serial = serial_tag[0].strip()
+ # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到
+ # label = container.xpath("//th[text()='レーベル:']/following-sibling::td/text()")[0].strip()
+ genre_tags = container.xpath("//th[text()='ジャンル:']/following-sibling::td/a")
+ genre = [i.text.strip() for i in genre_tags]
+ score_str = container.xpath("//td[@class='review']/span")[0].tail.strip()
+ match = re.search(r'^[\.\d]+', score_str)
+ if match:
+ score = float(match.group()) * 2
+ movie.score = f'{score:.2f}'
+ # plot可能含有嵌套格式,为了保留plot中的换行关系,手动处理plot中的各个标签
+ plots = []
+ plot_p_tags = container.xpath("//dl[@id='introduction']/dd/p[not(@class='more')]")
+ for p in plot_p_tags:
+ children = p.getchildren()
+ # 没有children时表明plot不含有格式,此时简单地提取文本就可以
+ if not children:
+ plots.append(p.text_content())
+ continue
+ for child in children:
+ if child.tag == 'br' and plots[-1] != '\n':
+ plots.append('\n')
+ else:
+ if child.text:
+ plots.append(child.text)
+ if child.tail:
+ plots.append(child.tail)
+ plot = ''.join(plots).strip()
+ preview_pics = container.xpath("//a[@class='sample_image']/@href")
+
+ if Cfg().crawler.hardworking:
+ # 预览视频是点击按钮后再加载的,不在静态网页中
+ btn_url = container.xpath("//a[@class='button_sample']/@href")[0]
+ video_pid = btn_url.split('/')[-1]
+ req_url = f'{self.base_url}/sampleplayer/sampleRespons.php?pid={video_pid}'
+ resp = await self.client.get(req_url)
+ j = await resp.json()
+ video_url = j.get('url')
+ if video_url:
+ # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX
+ preview_video = video_url.split('.ism/')[0] + '.mp4'
+ movie.preview_video = preview_video
+
+ movie.dvdid = dvdid
+ movie.url = url
+ movie.title = title
+ movie.cover = cover
+ movie.actress = actress
+ movie.producer = producer
+ movie.publish_date = publish_date
+ movie.genre = genre
+ movie.plot = plot
+ movie.preview_pics = preview_pics
+ movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
+
+
+if __name__ == "__main__":
+ async def test_main():
+ crawler = await MgstageCrawler.create()
+ movie = MovieInfo('ABF-153')
+ # try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ # except Exception as e:
+ # print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/njav.py b/javsp/crawlers/sites/njav.py
new file mode 100644
index 000000000..72826db02
--- /dev/null
+++ b/javsp/crawlers/sites/njav.py
@@ -0,0 +1,150 @@
+"""从NJAV抓取数据"""
+import re
+import logging
+from typing import List
+
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_session
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from javsp.lib import strftime_to_minutes
+from lxml import html
+
+
+logger = logging.getLogger(__name__)
+
+def get_list_first(list: List):
+ return list[0] if list and len(list) > 0 else None
+
+class NjavCrawler(Crawler):
+ id = CrawlerID.njav
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.njav.tv/')
+ self.base_url = str(url)
+ self.client = get_session(url)
+ return self
+
+ async def search_video(self, movie: MovieInfo) -> str:
+ id_uc = movie.dvdid
+ # 抓取网页
+ url = f'{self.base_url}ja/search?keyword={id_uc}'
+ resp = await self.client.get(url)
+ tree = html.fromstring(await resp.text())
+ list = tree.xpath("//div[@class='box-item']/div[@class='detail']/a")
+ video_url = None
+ for item in list:
+ search_title = item.xpath("text()")[0]
+ if id_uc in search_title:
+ video_url = item.xpath("@href")
+ break
+ if id_uc.startswith("FC2-"):
+ fc2id = id_uc.replace('FC2-', '')
+ if "FC2" in search_title and fc2id in search_title:
+ video_url = item.xpath("@href")
+ break
+
+ return get_list_first(video_url)
+
+    async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ # 抓取网页
+        url = await self.search_video(movie)
+        if not url:
+            raise MovieNotFoundError(__name__, movie.dvdid)
+        url = self.base_url + "ja/" + url
+ resp = await self.client.get(url)
+ tree = html.fromstring(await resp.text())
+ container = tree.xpath("//div[@class='container']/div/div[@class='col']")
+ if len(container) > 0:
+ container = container[0]
+ else:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+
+ title = container.xpath("//div[@class='d-flex justify-content-between align-items-start']/div/h1/text()")[0]
+ thumb_pic = container.xpath("//div[@id='player']/@data-poster")
+ plot = " ".join(container.xpath("//div[@class='description']/p/text()"))
+ magnet = container.xpath("//div[@class='magnet']/a/@href")
+ real_id = None
+ publish_date = None
+ duration_str = None
+ uncensored = None
+ preview_pics = None
+ preview_video = None
+ serial = None
+ publisher = None
+ producer = None
+ genre = []
+ actress = []
+
+ for item in container.xpath("//div[@class='detail-item']/div"):
+ item_title = item.xpath('span/text()')[0]
+ if "タグ:" in item_title:
+ genre += item.xpath("span")[1].xpath("a/text()")
+ elif "ジャンル:" in item_title:
+ genre += item.xpath("span")[1].xpath("a/text()")
+ elif "レーベル:" in item_title:
+ genre += item.xpath("span")[1].xpath("a/text()")
+ elif "女優:" in item_title:
+ actress = item.xpath("span")[1].xpath("a/text()")
+ elif "シリーズ:" in item_title:
+ serial = get_list_first(item.xpath("span")[1].xpath("a/text()"))
+ elif "メーカー:" in item_title:
+ producer = get_list_first(item.xpath("span")[1].xpath("a/text()"))
+ elif "コード:" in item_title:
+ real_id = get_list_first(item.xpath("span")[1].xpath("text()"))
+ elif "公開日:" in item_title:
+ publish_date = get_list_first(item.xpath("span")[1].xpath("text()"))
+ elif "再生時間:" in item_title:
+ duration_str = get_list_first(item.xpath("span")[1].xpath("text()"))
+
+ # 清除标题里的番号字符
+ keywords = [real_id, " "]
+ if movie.dvdid.startswith("FC2"):
+ keywords += ["FC2","PPV","-"] + [movie.dvdid.split("-")[-1]]
+ for keyword in keywords:
+ title = re.sub(re.escape(keyword), "", title, flags=re.I)
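+        # 例如番号为 'FC2-718323' 时,keywords 还会包含 'FC2'、'PPV'、'-' 和 '718323',逐一从标题中去除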
+
+ # 判断是否无码
+ uncensored_arr = magnet + [title]
+ for uncensored_str in uncensored_arr:
+ if 'uncensored' in uncensored_str.lower():
+ uncensored = True
+
+ movie.url = url
+ movie.title = title
+ movie.genre = genre
+ movie.actress = actress
+ movie.duration = str(strftime_to_minutes(duration_str))
+ movie.publish_date = publish_date
+ movie.publisher = publisher
+ movie.producer = producer
+ movie.uncensored = uncensored
+ movie.preview_pics = preview_pics
+ movie.preview_video = preview_video
+ movie.plot = plot
+ movie.serial = serial
+ movie.magnet = magnet
+
+ # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面
+ if movie.preview_pics:
+ movie.cover = preview_pics[0]
+ else:
+ movie.cover = get_list_first(thumb_pic)
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await NjavCrawler.create()
+ movie = MovieInfo('012023_002')
+ try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/prestige.py b/javsp/crawlers/sites/prestige.py
new file mode 100644
index 000000000..a4fe7de41
--- /dev/null
+++ b/javsp/crawlers/sites/prestige.py
@@ -0,0 +1,101 @@
+"""从蚊香社-prestige抓取数据"""
+import re
+import logging
+
+
+
+from javsp.crawlers.exceptions import MovieNotFoundError, SiteBlocked
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_session
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from lxml import html
+
+
+logger = logging.getLogger(__name__)
+
+
+class PrestigeCrawler(Crawler):
+ id = CrawlerID.prestige
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.prestige-av.com')
+ self.base_url = str(url)
+ self.client = get_session(url)
+ # prestige要求访问者携带已通过R18认证的cookies才能够获得完整数据,否则会被重定向到认证页面
+ # (其他多数网站的R18认证只是在网页上遮了一层,完整数据已经传回,不影响爬虫爬取)
+ self.client.cookie_jar.update_cookies({'__age_auth__': 'true'})
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """从网页抓取并解析指定番号的数据
+ Args:
+ movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
+ """
+ url = f'{self.base_url}goods/goods_detail.php?sku={movie.dvdid}'
+ resp = await self.client.get(url)
+ if resp.status == 500:
+ # 500错误表明prestige没有这部影片的数据,不是网络问题,因此不再重试
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ elif resp.status == 403:
+ raise SiteBlocked('prestige不允许从当前IP所在地区访问,请尝试更换为日本地区代理')
+ resp.raise_for_status()
+ tree = html.fromstring(await resp.text())
+ container_tags = tree.xpath("//section[@class='px-4 mb-4 md:px-8 md:mb-16']")
+ if not container_tags:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+
+ container = container_tags[0]
+ title = container.xpath("h1/span")[0].tail.strip()
+ cover = container.xpath("//div[@class='c-ratio-image mr-8']/picture/source/img/@src")[0]
+ cover = cover.split('?')[0]
+ actress = container.xpath("//p[text()='出演者:']/following-sibling::div/p/a/text()")
+ # 移除女优名中的空格,使女优名与其他网站保持一致
+ actress = [i.strip().replace(' ', '') for i in actress]
+ duration_str = container.xpath("//p[text()='収録時間:']")[0].getnext().text_content()
+ match = re.search(r'\d+', duration_str)
+ if match:
+ movie.duration = match.group(0)
+ date_url = container.xpath("//p[text()='発売日:']/following-sibling::div/a/@href")[0]
+ publish_date = date_url.split('?date=')[-1]
+ producer = container.xpath("//p[text()='メーカー:']/following-sibling::div/a/text()")[0].strip()
+ dvdid = container.xpath("//p[text()='品番:']/following-sibling::div/p/text()")[0]
+ genre_tags = container.xpath("//p[text()='ジャンル:']/following-sibling::div/a")
+ genre = [tag.text.strip() for tag in genre_tags]
+ serial = container.xpath("//p[text()='レーベル:']/following-sibling::div/a/text()")[0].strip()
+ plot = container.xpath("//h2[text()='商品紹介']/following-sibling::p")[0].text.strip()
+ preview_pics = container.xpath("//h2[text()='サンプル画像']/following-sibling::div/div/picture/source/img/@src")
+ preview_pics = [i.split('?')[0] for i in preview_pics]
+
+ # prestige改版后已经无法获取高清封面,此前已经获取的高清封面地址也已失效
+ movie.url = url
+ movie.dvdid = dvdid
+ movie.title = title
+ movie.cover = cover
+ movie.actress = actress
+ movie.publish_date = publish_date
+ movie.producer = producer
+ movie.genre = genre
+ movie.serial = serial
+ movie.plot = plot
+ movie.preview_pics = preview_pics
+ movie.uncensored = False # prestige服务器在日本且面向日本国内公开发售,不会包含无码片
+
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await PrestigeCrawler.create()
+ movie = MovieInfo('ABP-647')
+ try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/func.py b/javsp/func.py
index 042afea5c..d10ba8222 100644
--- a/javsp/func.py
+++ b/javsp/func.py
@@ -16,6 +16,8 @@
from pathlib import Path
import importlib.metadata as meta
+from pydantic_core import Url
+
# 判断系统是否可以使用tk
USE_GUI = True
try:
@@ -23,7 +25,7 @@
except ImportError:
USE_GUI = False
-from javsp.web.base import *
+from javsp.network.utils import get_session, url_download
from javsp.lib import re_escape, resource_path
@@ -150,7 +152,7 @@ def split_by_punc(s):
return ls
-def check_update(allow_check=True, auto_update=True):
+async def check_update(allow_check=True, auto_update=True):
"""检查版本更新"""
def print_header(title, info=[]):
@@ -181,7 +183,9 @@ def print_header(title, info=[]):
release_url = 'https://github.com/Yuukiy/JavSP/releases/latest'
print('正在检查更新...', end='')
try:
- data = request_get(api_url, timeout=3).json()
+ client = get_session(Url(api_url))
+ resp = await client.get(api_url)
+ data = await resp.json()
latest_version = data['tag_name']
release_time = utc2local(data['published_at'])
release_date = release_time.isoformat().split('T')[0]
@@ -233,7 +237,7 @@ def print_header(title, info=[]):
if auto_update:
try:
logger.info('尝试自动更新到新版本: ' + latest_version + " (按'Ctrl+C'取消)")
- download_update(data)
+ await download_update(data)
except KeyboardInterrupt:
logger.info('用户取消更新')
except Exception as e:
@@ -243,7 +247,7 @@ def print_header(title, info=[]):
print() # 输出空行,作为新旧程序的分隔
-def download_update(rel_info):
+async def download_update(rel_info):
"""下载版本更新
Args:
@@ -253,7 +257,8 @@ def download_update(rel_info):
down_url = rel_info['assets'][0]['browser_download_url']
asset_name = rel_info['assets'][0]['name']
desc = '下载更新' if shutil.get_terminal_size().columns < 120 else '下载更新: '+asset_name
- download(down_url, asset_name, desc=desc)
+ await url_download(Url(down_url), asset_name, desc=desc)
+ # download(down_url, asset_name, desc=desc)
if os.path.exists(asset_name):
# 备份原有的程序
basepath, ext = os.path.splitext(sys.executable)
@@ -270,8 +275,3 @@ def download_update(rel_info):
p.wait()
p.terminate()
sys.exit(0)
-
-
-if __name__ == "__main__":
- setattr(sys, 'javsp_version', 'v0')
- check_update()
diff --git a/javsp/network/client.py b/javsp/network/client.py
new file mode 100644
index 000000000..981afeee4
--- /dev/null
+++ b/javsp/network/client.py
@@ -0,0 +1,61 @@
+"""网络请求的统一接口"""
+
+from typing import Any, Coroutine, Dict
+from pydantic_core import Url
+
+from javsp.config import Cfg
+from aiohttp import BaseConnector, ClientSession, TCPConnector
+from aiohttp_socks import ProxyConnector
+import asyncio
+
+default_headers = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
+}
+
+def get_proxy(unproxied: bool):
+ if Cfg().network.proxy_server is None or unproxied:
+ return None
+ else:
+ return str(Cfg().network.proxy_server)
+
+session_dictionary: Dict[str, ClientSession] = {}
+proxy_connector: BaseConnector | None = None
+def get_session(url: Url) -> ClientSession:
+ if url.host is None:
+ raise Exception(f"Unknown url {url}")
+ else:
+ index = url.host
+ if index in session_dictionary:
+ return session_dictionary[index]
+ else:
+ proxy = get_proxy(url.host in Cfg().network.unproxied)
+
+
+ connector: BaseConnector
+ if proxy is None:
+ connector = TCPConnector()
+ else:
+ global proxy_connector
+ if proxy_connector is None:
+ proxy_connector = ProxyConnector.from_url(proxy)
+ connector = proxy_connector
+
+            session = ClientSession(
+                connector=connector,
+                # 共享的代理connector由clear_clients统一关闭,不能交给单个session管理,
+                # 否则先关闭的session会连带关闭其他session仍在使用的connector
+                connector_owner=proxy is None,
+                # 必须使用copy(),否则各个模块对headers的修改都将会指向本模块中定义的headers变量,导致只有最后一个对headers的修改生效
+                headers=default_headers.copy())
+
+
+ session_dictionary[index] = session
+
+ return session
+
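+# 使用示意(假设在异步上下文中调用;'ABC-123'仅为示例路径):
+#   session = get_session(Url('https://www.javbus.com'))
+#   resp = await session.get('https://www.javbus.com/ABC-123')
+# 同一host的后续请求会复用缓存的ClientSession,程序退出前应调用clear_clients()统一关闭
+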
+async def clear_clients():
+ close_tasks: list[Coroutine[Any, Any, None]] = []
+ for client in session_dictionary.values():
+ close_tasks.append(client.close())
+
+ await asyncio.gather(*close_tasks)
+
+ if proxy_connector is not None:
+ await proxy_connector.close()
diff --git a/javsp/network/utils.py b/javsp/network/utils.py
new file mode 100644
index 000000000..00e379098
--- /dev/null
+++ b/javsp/network/utils.py
@@ -0,0 +1,105 @@
+from datetime import timedelta
+import logging
+import time
+from aiohttp import ClientTimeout
+from tqdm.asyncio import tqdm
+from typing import Any, Coroutine, NamedTuple
+import aiofiles
+from pydantic.types import ByteSize
+from pydantic_core import Url
+
+from pydantic_extra_types.pendulum_dt import Duration
+
+from javsp.config import Cfg, CrawlerID
+from javsp.network.client import get_session, clear_clients
+
+import asyncio
+
+logger = logging.getLogger(__name__)
+
+class DownloadInfo(NamedTuple):
+ size: ByteSize
+ elapsed: timedelta
+
+ def get_rate(self) -> float:
+ """get rate of this download, unit: Mbps"""
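+        # 示例(假设 ByteSize.to('mbit') 按 10^6 bit 换算):下载 1_000_000 字节耗时 2 秒 -> 8 / 2 = 4.0 Mbps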
+ return self.size.to("mbit") / self.elapsed.total_seconds()
+
+async def url_download(url: Url, target_path: str, desc: str | None = None) -> DownloadInfo:
+ url_str = str(url)
+
+ if not desc:
+ desc = url_str.split('/')[-1]
+
+ s = get_session(url)
+
+ # REF: https://www.python-httpx.org/advanced/clients/#monitoring-download-progress
+ async with aiofiles.open(target_path, 'wb') as download_file:
+        # 复用按host缓存的session完成下载
+
+ start = time.monotonic()
+ async with s.get(url_str) as response:
+ total = response.content_length
+
+ with tqdm(total=total, unit_scale=True, unit_divisor=1024, unit="B") as progress:
+ async for chunk in response.content.iter_any():
+ await download_file.write(chunk)
+ progress.update(len(chunk))
+
+ response_time = time.monotonic() - start
+ return DownloadInfo(ByteSize(total), timedelta(seconds=response_time))
+
+async def test_connect(url_str: str, timeout: Duration) -> bool:
+ """测试与指定url的连接,不使用映射,但使用代理"""
+ try:
+ s = get_session(Url(url_str))
+ response = \
+ await s.get(
+ url_str,
+ timeout=ClientTimeout(total=timeout.total_seconds()),
+ )
+ return response.status == 200
+ except Exception as e:
+ logger.debug(f"Not connectable: {url_str}\n" + repr(e))
+ return False
+
+async def choose_one_connectable(urls: list[str]) -> str | None:
+ co_connectables: list[Coroutine[Any, Any, bool]] = []
+ for url in urls:
+ co_connectables.append(test_connect(url, Duration(seconds=3)))
+
+ connectables = await asyncio.gather(*co_connectables)
+ for i, connectable in enumerate(connectables):
+ if connectable:
+ return urls[i]
+ return None
+
+async def resolve_site_fallback(cr_id: CrawlerID, default: str) -> Url:
+ if cr_id not in Cfg().network.fallback:
+ return Url(default)
+
+ fallbacks = Cfg().network.fallback[cr_id]
+ chosen = await choose_one_connectable(fallbacks)
+ if chosen is None:
+ return Url(default)
+ else:
+ return Url(chosen)
+
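+# 使用示意(假设config中为javdb配置了fallback地址列表):
+#   url = await resolve_site_fallback(CrawlerID.javdb, 'https://www.javdb.com')
+# 会并发测试列表中各地址的连通性,全部不可用时返回传入的默认地址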
+
+if __name__ == '__main__':
+ async def aentry():
+ print(await choose_one_connectable(['http://iandown.what', 'http://www.baidu.com']))
+ from javsp.network.client import clear_clients
+ await clear_clients()
+
+ # async def aentry():
+ # print(await test_connect("https://www.y78k.com/", Duration(seconds=3)))
+
+ # async def aentry():
+ # await asyncio.gather(
+ # url_download(Url('https://www.google.com/images/branding/googlelogo/2x/googlelogo_light_color_272x92dp.png'), 'gogle_logo.png'),
+ # url_download(Url('https://ei.phncdn.com/www-static/images/pornhub_logo_straight.svg?cache=2024092501'), 'pornhub_logo.svg'),
+ # )
+ # await clear_clients()
+
+ asyncio.run(aentry())
diff --git a/javsp/web/translate.py b/javsp/translate.py
similarity index 82%
rename from javsp/web/translate.py
rename to javsp/translate.py
index 2e762cb15..66b8cb161 100644
--- a/javsp/web/translate.py
+++ b/javsp/translate.py
@@ -1,21 +1,19 @@
"""网页翻译接口"""
# 由于翻译服务不走代理,而且需要自己的错误处理机制,因此不通过base.py来管理网络请求
import time
-from typing import Union
import uuid
import random
import logging
from pydantic_core import Url
-import requests
from hashlib import md5
__all__ = ['translate', 'translate_movie_info']
-from javsp.config import BaiduTranslateEngine, BingTranslateEngine, Cfg, ClaudeTranslateEngine, GoogleTranslateEngine, OpenAITranslateEngine, TranslateEngine
+from javsp.config import Cfg, TranslateEngine
from javsp.datatype import MovieInfo
-from javsp.web.base import read_proxy
+from javsp.network.client import get_session
logger = logging.getLogger(__name__)
@@ -49,13 +47,7 @@ def translate_movie_info(info: MovieInfo):
return False
return True
-def translate(texts, engine: Union[
- BaiduTranslateEngine,
- BingTranslateEngine,
- ClaudeTranslateEngine,
- OpenAITranslateEngine,
- None
- ], actress=[]):
+def translate(texts, engine: TranslateEngine, actress=[]):
"""
翻译入口:对错误进行处理并且统一返回格式
@@ -132,7 +124,7 @@ def translate(texts, engine: Union[
else:
return {'trans': texts}
-def baidu_translate(texts, app_id, api_key, to='zh'):
+async def baidu_translate(texts, app_id, api_key, to='zh'):
"""使用百度翻译文本(默认翻译为简体中文)"""
api_url = "https://api.fanyi.baidu.com/api/trans/vip/translate"
headers = {'Content-Type': 'application/x-www-form-urlencoded'}
@@ -146,13 +138,14 @@ def baidu_translate(texts, app_id, api_key, to='zh'):
wait = 1.0 - (now - last_access)
if wait > 0:
time.sleep(wait)
- r = requests.post(api_url, params=payload, headers=headers)
- result = r.json()
+ s = get_session(Url(api_url))
+ r = await s.post(api_url, params=payload, headers=headers)
+ result = await r.json()
baidu_translate._last_access = time.perf_counter()
return result
-def bing_translate(texts, api_key, to='zh-Hans'):
+async def bing_translate(texts, api_key, to='zh-Hans'):
"""使用Bing翻译文本(默认翻译为简体中文)"""
api_url = "https://api.cognitive.microsofttranslator.com/translate"
params = {'api-version': '3.0', 'to': to, 'includeSentenceLength': True}
@@ -163,34 +156,36 @@ def bing_translate(texts, api_key, to='zh-Hans'):
'X-ClientTraceId': str(uuid.uuid4())
}
body = [{'text': texts}]
- r = requests.post(api_url, params=params, headers=headers, json=body)
- result = r.json()
+ s = get_session(Url(api_url))
+ r = await s.post(api_url, params=params, headers=headers, json=body)
+ result = await r.json()
return result
_google_trans_wait = 60
-def google_trans(texts, to='zh_CN'):
+async def google_trans(texts, to='zh_CN'):
"""使用Google翻译文本(默认翻译为简体中文)"""
# API: https://www.jianshu.com/p/ce35d89c25c3
# client参数的选择: https://github.com/lmk123/crx-selection-translate/issues/223#issue-184432017
global _google_trans_wait
url = f"https://translate.google.com.hk/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={to}&q={texts}"
- proxies = read_proxy()
- r = requests.get(url, proxies=proxies)
- while r.status_code == 429:
- logger.warning(f"HTTP {r.status_code}: {r.reason}: Google翻译请求超限,将等待{_google_trans_wait}秒后重试")
+ s = get_session(Url(url))
+ r = await s.get(url)
+ # TODO: retry已经集成到client里了,这里考虑删除
+ while r.status == 429:
+ logger.warning(f"HTTP {r.status}: {r.reason}: Google翻译请求超限,将等待{_google_trans_wait}秒后重试")
time.sleep(_google_trans_wait)
- r = requests.get(url, proxies=proxies)
- if r.status_code == 429:
+            r = await s.get(url)
+ if r.status == 429:
_google_trans_wait += random.randint(60, 90)
- if r.status_code == 200:
- result = r.json()
+ if r.status == 200:
+ result = await r.json()
else:
- result = {'error_code': r.status_code, 'error_msg': r.reason}
+ result = {'error_code': r.status, 'error_msg': r.reason}
time.sleep(4) # Google翻译的API有QPS限制,因此需要等待一段时间
return result
-def claude_translate(texts, api_key, to="zh_CN"):
+async def claude_translate(texts, api_key, to="zh_CN"):
"""使用Claude翻译文本(默认翻译为简体中文)"""
api_url = "https://api.anthropic.com/v1/messages"
headers = {
@@ -204,17 +199,20 @@ def claude_translate(texts, api_key, to="zh_CN"):
"max_tokens": 1024,
"messages": [{"role": "user", "content": texts}],
}
- r = requests.post(api_url, headers=headers, json=data)
- if r.status_code == 200:
- result = r.json().get("content", [{}])[0].get("text", "").strip()
+
+ s = get_session(Url(api_url))
+ r = await s.post(api_url, headers=headers, json=data)
+ j = await r.json()
+ if r.status == 200:
+ result = j.get("content", [{}])[0].get("text", "").strip()
else:
result = {
- "error_code": r.status_code,
- "error_msg": r.json().get("error", {}).get("message", r.reason),
+ "error_code": r.status,
+ "error_msg": j.get("error", {}).get("message", r.reason),
}
return result
-def openai_translate(texts, url: Url, api_key: str, model: str, to="zh_CN"):
+async def openai_translate(texts, url: Url, api_key: str, model: str, to="zh_CN"):
"""使用 OpenAI 翻译文本(默认翻译为简体中文)"""
api_url = str(url)
headers = {
@@ -236,18 +234,20 @@ def openai_translate(texts, url: Url, api_key: str, model: str, to="zh_CN"):
"temperature": 0,
"max_tokens": 1024,
}
- r = requests.post(api_url, headers=headers, json=data)
- if r.status_code == 200:
- if 'error' in r.json():
+ s = get_session(Url(api_url))
+ r = await s.post(api_url, headers=headers, json=data)
+ if r.status == 200:
+ j = await r.json()
+ if 'error' in j:
result = {
- "error_code": r.status_code,
- "error_msg": r.json().get("error", {}).get("message", ""),
+ "error_code": r.status,
+ "error_msg": j.get("error", {}).get("message", ""),
}
else:
- result = r.json().get("choices", [{}])[0].get("message", {}).get("content", "").strip()
+ result = j.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
else:
result = {
- "error_code": r.status_code,
+ "error_code": r.status,
"error_msg": r.reason,
}
return result
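Every translator now follows the same request pattern: obtain the shared session for the target URL via get_session, await the response, and await its JSON body, inspecting r.status instead of raising. Below is a minimal sketch of that pattern, assuming get_session returns an aiohttp-style session as the calls above suggest; the endpoint and payload are purely illustrative.

import asyncio
from pydantic_core import Url
from javsp.network.client import get_session, clear_clients

async def post_json(api_url: str, payload: dict) -> dict:
    # Obtain the shared client appropriate for this URL's host.
    s = get_session(Url(api_url))
    r = await s.post(api_url, json=payload)
    # Callers check r.status themselves, as the translators above do.
    return await r.json()

async def main() -> None:
    # Hypothetical echo endpoint, for illustration only.
    print(await post_json('https://httpbin.org/post', {'text': 'hello'}))
    await clear_clients()

asyncio.run(main())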
diff --git a/javsp/web/airav.py b/javsp/web/airav.py
deleted file mode 100644
index 22e9fdbf7..000000000
--- a/javsp/web/airav.py
+++ /dev/null
@@ -1,118 +0,0 @@
-"""从airav抓取数据"""
-import re
-import logging
-from html import unescape
-
-
-from javsp.web.base import Request
-from javsp.web.exceptions import *
-from javsp.config import Cfg
-from javsp.datatype import MovieInfo
-
-# 初始化Request实例
-request = Request(use_scraper=True)
-request.headers['Accept-Language'] = 'zh-TW,zh;q=0.9'
-# 近期airav服务器似乎不稳定,时好时坏,单次查询平均在17秒左右,timeout时间增加到20秒
-request.timeout = 20
-
-
-logger = logging.getLogger(__name__)
-base_url = 'https://www.airav.wiki'
-
-
-def search_movie(dvdid):
- """通过搜索番号获取指定的影片在网站上的ID"""
- # 部分影片的ID并不直接等于番号(如012717-360),此时需要尝试通过搜索来寻找影片
- page = 0
- count = 1
- result = []
- while len(result) < count:
- url = f'{base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={dvdid}&page={page}'
- r = request.get(url).json()
- # {"offset": 2460, "count": 12345, "result": [...], "status": "ok"}
- if r['result']:
- result.extend(r['result'])
- count = r['count']
- page += 1
- else: # 结果为空,结束循环
- break
- # 如果什么都没搜索到,直接返回
- if not result:
- raise MovieNotFoundError(__name__, dvdid)
- # 排序,以优先选择更符合预期的结果(如'012717_472'对应的'1pondo_012717_472'和'_1pondo_012717_472')
- result.sort(key=lambda x:x['barcode'])
- # 从所有搜索结果中选择最可能的番号,返回它的URL
- target = dvdid.replace('-', '_')
- for item in result:
- # {'vid': '', 'slug': '', 'name': '', 'url': '', 'view': '', 'img_url': '', 'barcode': ''}
- barcode = item['barcode'].replace('-', '_')
- if target in barcode:
- return item['barcode']
- raise MovieNotFoundError(__name__, dvdid, result)
-
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- # airav也提供简体,但是为了尽量保持女优名等与其他站点一致,抓取繁体的数据
- url = f'{base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW'
- resp = request.get(url).json()
- # 只在番号是纯数字时,尝试进行搜索,否则可能导致搜索到错误的影片信息
- if resp['count'] == 0 and re.match(r'\d{6}[-_]\d{2,3}', movie.dvdid):
- barcode = search_movie(movie.dvdid)
- if barcode:
- url = f'{base_url}/api/video/barcode/{barcode}?lng=zh-TW'
- resp = request.get(url).json()
- if resp['count'] == 0:
- raise MovieNotFoundError(__name__, movie.dvdid, resp)
-
- # 从API返回的数据中提取需要的字段
- # TODO: 数据中含有更多信息(如女优的中文&日文名对照),可能有助于未来功能扩展
- data = resp['result']
- dvdid = data['barcode']
- movie.dvdid = dvdid
- movie.url = base_url + '/video/' + dvdid
- # plot和title中可能含有HTML的转义字符,需要进行解转义处理
- movie.plot = unescape(data['description']) or None
- movie.cover = data['img_url']
- # airav的genre是以搜索关键词的形式组织的,没有特定的genre_id
- movie.genre = [i['name'] for i in data['tags']]
- movie.title = unescape(data['name'])
- movie.actress = [i['name'] for i in data['actors']]
- movie.publish_date = data['publish_date']
- movie.preview_pics = data['images'] or []
- if data['factories']:
- movie.producer = data['factories'][0]['name']
-
- if Cfg().crawler.hardworking:
- # 注意这里用的是获取的dvdid,而不是传入的movie.dvdid(如'1pondo_012717_472'与'012717_472')
- video_url = f"{base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}"
- resp = request.get(video_url).json()
- # 如果失败,结果如 {'msg': 'fail', 'status': 'fail'}
- if 'data' in resp:
- # 除url外还有url_cdn, url_hlx, url_hls_cdn字段,后两者为m3u8格式。目前将url作为预览视频的地址
- # TODO: 发现部分影片(如080719-976)的传统格式预览片错误
- movie.preview_video = resp['data'].get('url')
-
- # airav上部分影片会被标记为'馬賽克破壞版'等,这些影片的title、plot和genre都不再准确
- for keyword in ('馬賽克破壞版', '馬賽克破解版', '無碼流出版'):
- if movie.title and keyword in movie.title:
- movie.title = None
- movie.genre = []
- if movie.plot and keyword in movie.plot:
- movie.plot = None
- movie.genre = []
- if not any([movie.title, movie.plot, movie.genre]):
- break
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('DSAD-938')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/arzon.py b/javsp/web/arzon.py
deleted file mode 100644
index 433949018..000000000
--- a/javsp/web/arzon.py
+++ /dev/null
@@ -1,100 +0,0 @@
-"""从arzon抓取数据"""
-import os
-import sys
-import logging
-import re
-
-from javsp.web.base import request_get
-from javsp.web.exceptions import *
-from javsp.datatype import MovieInfo
-import requests
-from lxml import html
-
-logger = logging.getLogger(__name__)
-base_url = "https://www.arzon.jp"
-
-def get_cookie():
- # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F
- skip_verify_url = "http://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1"
- session = requests.Session()
- session.get(skip_verify_url, timeout=(12, 7))
- return session.cookies.get_dict()
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- full_id = movie.dvdid
- cookies = get_cookie()
- url = f'{base_url}/itemlist.html?t=&m=all&s=&q={full_id}'
- # url = f'{base_url}/imagelist.html?q={full_id}'
- r = request_get(url, cookies, delay_raise=True)
- if r.status_code == 404:
- raise MovieNotFoundError(__name__, movie.dvdid)
- # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
- data = html.fromstring(r.content)
-
- urls = data.xpath("//h2/a/@href")
- if len(urls) == 0:
- raise MovieNotFoundError(__name__, movie.dvdid)
-
- item_url = base_url + urls[0]
- e = request_get(item_url, cookies, delay_raise=True)
- item = html.fromstring(e.content)
-
- title = item.xpath("//div[@class='detail_title_new2']//h1/text()")[0]
- cover = item.xpath("//td[@align='center']//a/img/@src")[0]
- item_text = item.xpath("//div[@class='item_text']/text()")
- plot = [item.strip() for item in item_text if item.strip() != ''][0]
- preview_pics_arr = item.xpath("//div[@class='detail_img']//img/@src")
- # 使用列表推导式添加 "http:" 并去除 "m_"
- preview_pics = [("https:" + url).replace("m_", "") for url in preview_pics_arr]
-
- container = item.xpath("//div[@class='item_register']/table//tr")
- for row in container:
- key = row.xpath("./td[1]/text()")[0]
- contents = row.xpath("./td[2]//text()")
- content = [item.strip() for item in contents if item.strip() != '']
- index = 0
- value = content[index] if content and index < len(content) else None
- if key == "AV女優:":
- movie.actress = content
- if key == "AVメーカー:":
- movie.producer = value
- if key == "AVレーベル:":
- video_type = value
- if key == "シリーズ:":
- movie.serial = value
- if key == "監督:":
- movie.director = value
- if key == "発売日:" and value:
- movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-")
- if key == "収録時間:" and value:
- movie.duration = re.search(r'([\d.]+)分', value).group(1)
- if key == "品番:":
- dvd_id = value
- elif key == "タグ:":
- genre = value
-
- genres = ''
- if video_type:
- genres = [video_type]
- if(genre != None):
- genres.append(genre)
-
- movie.genre = genres
- movie.url = item_url
- movie.title = title
- movie.plot = plot
- movie.cover = f'https:{cover}'
- movie.preview_pics = preview_pics
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('csct-011')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/arzon_iv.py b/javsp/web/arzon_iv.py
deleted file mode 100644
index 3ea7a322f..000000000
--- a/javsp/web/arzon_iv.py
+++ /dev/null
@@ -1,93 +0,0 @@
-"""从arzon抓取数据"""
-import os
-import sys
-import logging
-import re
-
-from javsp.web.base import request_get
-from javsp.web.exceptions import *
-from javsp.datatype import MovieInfo
-import requests
-from lxml import html
-
-logger = logging.getLogger(__name__)
-base_url = "https://www.arzon.jp"
-
-def get_cookie():
- # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F
- skip_verify_url = "http://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1"
- session = requests.Session()
- session.get(skip_verify_url, timeout=(12, 7))
- return session.cookies.get_dict()
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- full_id = movie.dvdid
- cookies = get_cookie()
- url = f'{base_url}/imagelist.html?q={full_id}'
- r = request_get(url, cookies, delay_raise=True)
- if r.status_code == 404:
- raise MovieNotFoundError(__name__, movie.dvdid)
- # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
- data = html.fromstring(r.content)
-
- urls = data.xpath("//h2/a/@href")
- if len(urls) == 0:
- raise MovieNotFoundError(__name__, movie.dvdid)
-
- item_url = base_url + urls[0]
- e = request_get(item_url, cookies, delay_raise=True)
- item = html.fromstring(e.content)
-
- title = item.xpath("//div[@class='detail_title_new']//h1/text()")[0]
- cover = item.xpath("//td[@align='center']//a/img/@src")[0]
- item_text = item.xpath("//div[@class='item_text']/text()")
- plot = [item.strip() for item in item_text if item.strip() != ''][0]
-
- container = item.xpath("//div[@class='item_register']/table//tr")
- for row in container:
- key = row.xpath("./td[1]/text()")[0]
- contents = row.xpath("./td[2]//text()")
- content = [item.strip() for item in contents if item.strip() != '']
- index = 0
- value = content[index] if content and index < len(content) else None
- if key == "タレント:":
- movie.actress = content
- if key == "イメージメーカー:":
- movie.producer = value
- if key == "イメージレーベル:":
- video_type = value
- if key == "監督:":
- movie.director = value
- if key == "発売日:" and value:
- movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-")
- if key == "収録時間:" and value:
- movie.duration = re.search(r'([\d.]+)分', value).group(1)
- if key == "品番:":
- dvd_id = value
- elif key == "タグ:":
- genre = value
-
- genres = ''
- if video_type:
- genres = [video_type]
- if(genre != None):
- genres.append(genre)
-
- movie.genre = genres
- movie.url = item_url
- movie.title = title
- movie.plot = plot
- movie.cover = f'https:{cover}'
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('KIDM-1137B')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/avsox.py b/javsp/web/avsox.py
deleted file mode 100644
index ea96d6cc3..000000000
--- a/javsp/web/avsox.py
+++ /dev/null
@@ -1,75 +0,0 @@
-"""从avsox抓取数据"""
-import logging
-
-from javsp.web.base import get_html
-from javsp.web.exceptions import *
-from javsp.config import Cfg, CrawlerID
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_url = str(Cfg().network.proxy_free[CrawlerID.avsox])
-
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- # avsox无法直接跳转到影片的网页,因此先搜索再从搜索结果中寻找目标网页
- full_id = movie.dvdid
- if full_id.startswith('FC2-'):
- full_id = full_id.replace('FC2-', 'FC2-PPV-')
- html = get_html(f'{base_url}tw/search/{full_id}')
- ids = html.xpath("//div[@class='photo-info']/span/date[1]/text()")
- urls = html.xpath("//a[contains(@class, 'movie-box')]/@href")
- ids_lower = list(map(str.lower, ids))
- if full_id.lower() in ids_lower:
- url = urls[ids_lower.index(full_id.lower())]
- url = url.replace('/tw/', '/cn/', 1)
- else:
- raise MovieNotFoundError(__name__, movie.dvdid, ids)
-
- # 提取影片信息
- html = get_html(url)
- container = html.xpath("/html/body/div[@class='container']")[0]
- title = container.xpath("h3/text()")[0]
- cover = container.xpath("//a[@class='bigImage']/@href")[0]
- info = container.xpath("div/div[@class='col-md-3 info']")[0]
- dvdid = info.xpath("p/span[@style]/text()")[0]
- publish_date = info.xpath("p/span[text()='发行时间:']")[0].tail.strip()
- duration = info.xpath("p/span[text()='长度:']")[0].tail.replace('分钟', '').strip()
- producer, serial = None, None
- producer_tag = info.xpath("p[text()='制作商: ']")[0].getnext().xpath("a")
- if producer_tag:
- producer = producer_tag[0].text_content()
- serial_tag = info.xpath("p[text()='系列:']")
- if serial_tag:
- serial = serial_tag[0].getnext().xpath("a/text()")[0]
- genre = info.xpath("p/span[@class='genre']/a/text()")
- actress = container.xpath("//a[@class='avatar-box']/span/text()")
-
- movie.dvdid = dvdid.replace('FC2-PPV-', 'FC2-')
- movie.url = url
- movie.title = title.replace(dvdid, '').strip()
- movie.cover = cover
- movie.publish_date = publish_date
- movie.duration = duration
- movie.genre = genre
- movie.actress = actress
- if full_id.startswith('FC2-'):
- # avsox把FC2作品的拍摄者归类到'系列'而制作商固定为'FC2-PPV',这既不合理也与其他的站点不兼容,因此进行调整
- movie.producer = serial
- else:
- movie.producer = producer
- movie.serial = serial
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('082713-417')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/avwiki.py b/javsp/web/avwiki.py
deleted file mode 100644
index fbd4ecbb3..000000000
--- a/javsp/web/avwiki.py
+++ /dev/null
@@ -1,72 +0,0 @@
-"""从av-wiki抓取数据"""
-import logging
-
-
-from javsp.web.base import *
-from javsp.web.exceptions import *
-from javsp.datatype import MovieInfo
-
-logger = logging.getLogger(__name__)
-base_url = 'https://av-wiki.net'
-
-
-def parse_data(movie: MovieInfo):
- """从网页抓取并解析指定番号的数据
- Args:
- movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
- """
- movie.url = url = f'{base_url}/{movie.dvdid}'
- resp = request_get(url, delay_raise=True)
- if resp.status_code == 404:
- raise MovieNotFoundError(__name__, movie.dvdid)
- html = resp2html(resp)
-
- cover_tag = html.xpath("//header/div/a[@class='image-link-border']/img")
- if cover_tag:
- try:
- srcset = cover_tag[0].get('srcset').split(', ')
- src_set_urls = {}
- for src in srcset:
- url, width = src.split()
- width = int(width.rstrip('w'))
- src_set_urls[width] = url
- max_pic = sorted(src_set_urls.items(), key=lambda x:x[0], reverse=True)
- movie.cover = max_pic[0][1]
- except:
- movie.cover = cover_tag[0].get('src')
- body = html.xpath("//section[@class='article-body']")[0]
- title = body.xpath("div/p/text()")[0]
- title = title.replace(f"【{movie.dvdid}】", '')
- cite_url = body.xpath("div/cite/a/@href")[0]
- cite_url = cite_url.split('?aff=')[0]
- info = body.xpath("dl[@class='dltable']")[0]
- dt_txt_ls, dd_tags = info.xpath("dt/text()"), info.xpath("dd")
- data = {}
- for dt_txt, dd in zip(dt_txt_ls, dd_tags):
- dt_txt = dt_txt.strip()
- a_tag = dd.xpath('a')
- if len(a_tag) == 0:
- dd_txt = dd.text.strip()
- else:
- dd_txt = [i.text.strip() for i in a_tag]
- if isinstance(dd_txt, list) and dt_txt != 'AV女優名': # 只有女优名以列表的数据格式保留
- dd_txt = dd_txt[0]
- data[dt_txt] = dd_txt
-
- ATTR_MAP = {'メーカー': 'producer', 'AV女優名': 'actress', 'メーカー品番': 'dvdid', 'シリーズ': 'serial', '配信開始日': 'publish_date'}
- for key, attr in ATTR_MAP.items():
- setattr(movie, attr, data.get(key))
- movie.title = title
- movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
-
- movie = MovieInfo('259LUXU-593')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/base.py b/javsp/web/base.py
deleted file mode 100644
index 717b5168a..000000000
--- a/javsp/web/base.py
+++ /dev/null
@@ -1,270 +0,0 @@
-"""网络请求的统一接口"""
-import os
-import sys
-import time
-import shutil
-import logging
-import requests
-import contextlib
-import cloudscraper
-import lxml.html
-from tqdm import tqdm
-from lxml import etree
-from lxml.html.clean import Cleaner
-from requests.models import Response
-
-
-from javsp.config import Cfg
-from javsp.web.exceptions import *
-
-
-__all__ = ['Request', 'get_html', 'post_html', 'request_get', 'resp2html', 'is_connectable', 'download', 'get_resp_text', 'read_proxy']
-
-
-headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}
-
-logger = logging.getLogger(__name__)
-# 删除js脚本相关的tag,避免网页检测到没有js运行环境时强行跳转,影响调试
-cleaner = Cleaner(kill_tags=['script', 'noscript'])
-
-def read_proxy():
- if Cfg().network.proxy_server is None:
- return {}
- else:
- proxy = str(Cfg().network.proxy_server)
- return {'http': proxy, 'https': proxy}
-
-# 与网络请求相关的功能汇总到一个模块中以方便处理,但是不同站点的抓取器又有自己的需求(针对不同网站
-# 需要使用不同的UA、语言等)。每次都传递参数很麻烦,而且会面临函数参数越加越多的问题。因此添加这个
-# 处理网络请求的类,它带有默认的属性,但是也可以在各个抓取器模块里进行进行定制
-class Request():
- """作为网络请求出口并支持各个模块定制功能"""
- def __init__(self, use_scraper=False) -> None:
- # 必须使用copy(),否则各个模块对headers的修改都将会指向本模块中定义的headers变量,导致只有最后一个对headers的修改生效
- self.headers = headers.copy()
- self.cookies = {}
-
- self.proxies = read_proxy()
- self.timeout = Cfg().network.timeout.total_seconds()
- if not use_scraper:
- self.scraper = None
- self.__get = requests.get
- self.__post = requests.post
- self.__head = requests.head
- else:
- self.scraper = cloudscraper.create_scraper()
- self.__get = self._scraper_monitor(self.scraper.get)
- self.__post = self._scraper_monitor(self.scraper.post)
- self.__head = self._scraper_monitor(self.scraper.head)
-
- def _scraper_monitor(self, func):
- """监控cloudscraper的工作状态,遇到不支持的Challenge时尝试退回常规的requests请求"""
- def wrapper(*args, **kw):
- try:
- return func(*args, **kw)
- except Exception as e:
- logger.debug(f"无法通过CloudFlare检测: '{e}', 尝试退回常规的requests请求")
- if func == self.scraper.get:
- return requests.get(*args, **kw)
- else:
- return requests.post(*args, **kw)
- return wrapper
-
- def get(self, url, delay_raise=False):
- r = self.__get(url,
- headers=self.headers,
- proxies=self.proxies,
- cookies=self.cookies,
- timeout=self.timeout)
- if not delay_raise:
- r.raise_for_status()
- return r
-
- def post(self, url, data, delay_raise=False):
- r = self.__post(url,
- data=data,
- headers=self.headers,
- proxies=self.proxies,
- cookies=self.cookies,
- timeout=self.timeout)
- if not delay_raise:
- r.raise_for_status()
- return r
-
- def head(self, url, delay_raise=True):
- r = self.__head(url,
- headers=self.headers,
- proxies=self.proxies,
- cookies=self.cookies,
- timeout=self.timeout)
- if not delay_raise:
- r.raise_for_status()
- return r
-
- def get_html(self, url):
- r = self.get(url)
- html = resp2html(r)
- return html
-
-
-class DownloadProgressBar(tqdm):
- def update_to(self, b=1, bsize=1, tsize=None):
- if tsize is not None:
- self.total = tsize
- self.update(b * bsize - self.n)
-
-
-def request_get(url, cookies={}, timeout=None, delay_raise=False):
- """获取指定url的原始请求"""
- if timeout is None:
- timeout = Cfg().network.timeout.seconds
-
- r = requests.get(url, headers=headers, proxies=read_proxy(), cookies=cookies, timeout=timeout)
- if not delay_raise:
- if r.status_code == 403 and b'>Just a moment...<' in r.content:
- raise SiteBlocked(f"403 Forbidden: 无法通过CloudFlare检测: {url}")
- else:
- r.raise_for_status()
- return r
-
-
-def request_post(url, data, cookies={}, timeout=None, delay_raise=False):
- """向指定url发送post请求"""
- if timeout is None:
- timeout = Cfg().network.timeout.seconds
- r = requests.post(url, data=data, headers=headers, proxies=read_proxy(), cookies=cookies, timeout=timeout)
- if not delay_raise:
- r.raise_for_status()
- return r
-
-
-def get_resp_text(resp: Response, encoding=None):
- """提取Response的文本"""
- if encoding:
- resp.encoding = encoding
- else:
- resp.encoding = resp.apparent_encoding
- return resp.text
-
-
-def get_html(url, encoding='utf-8'):
- """使用get方法访问指定网页并返回经lxml解析后的document"""
- resp = request_get(url)
- text = get_resp_text(resp, encoding=encoding)
- html = lxml.html.fromstring(text)
- html.make_links_absolute(url, resolve_base_href=True)
- # 清理功能仅应在需要的时候用来调试网页(如prestige),否则可能反过来影响调试(如JavBus)
- # html = cleaner.clean_html(html)
- if hasattr(sys, 'javsp_debug_mode'):
- lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug
- return html
-
-
-def resp2html(resp, encoding='utf-8') -> lxml.html.HtmlComment:
- """将request返回的response转换为经lxml解析后的document"""
- text = get_resp_text(resp, encoding=encoding)
- html = lxml.html.fromstring(text)
- html.make_links_absolute(resp.url, resolve_base_href=True)
- # html = cleaner.clean_html(html)
- if hasattr(sys, 'javsp_debug_mode'):
- lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug
- return html
-
-
-def post_html(url, data, encoding='utf-8', cookies={}):
- """使用post方法访问指定网页并返回经lxml解析后的document"""
- resp = request_post(url, data, cookies=cookies)
- text = get_resp_text(resp, encoding=encoding)
- html = lxml.html.fromstring(text)
- # jav321提供ed2k形式的资源链接,其中的非ASCII字符可能导致转换失败,因此要先进行处理
- ed2k_tags = html.xpath("//a[starts-with(@href,'ed2k://')]")
- for tag in ed2k_tags:
- tag.attrib['ed2k'], tag.attrib['href'] = tag.attrib['href'], ''
- html.make_links_absolute(url, resolve_base_href=True)
- for tag in ed2k_tags:
- tag.attrib['href'] = tag.attrib['ed2k']
- tag.attrib.pop('ed2k')
- # html = cleaner.clean_html(html)
- # lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug
- return html
-
-
-def dump_xpath_node(node, filename=None):
- """将xpath节点dump到文件"""
- if not filename:
- filename = node.tag + '.html'
- with open(filename, 'wt', encoding='utf-8') as f:
- content = etree.tostring(node, pretty_print=True).decode('utf-8')
- f.write(content)
-
-
-def is_connectable(url, timeout=3):
- """测试与指定url的连接"""
- try:
- r = requests.get(url, headers=headers, timeout=timeout)
- return True
- except requests.exceptions.RequestException as e:
- logger.debug(f"Not connectable: {url}\n" + repr(e))
- return False
-
-
-def urlretrieve(url, filename=None, reporthook=None, headers=None):
- if "arzon" in url:
- headers["Referer"] = "https://www.arzon.jp/"
- """使用requests实现urlretrieve"""
- # https://blog.csdn.net/qq_38282706/article/details/80253447
- with contextlib.closing(requests.get(url, headers=headers,
- proxies=read_proxy(), stream=True)) as r:
- header = r.headers
- with open(filename, 'wb+') as fp:
- bs = 1024
- size = -1
- blocknum = 0
- if "content-length" in header:
- size = int(header["Content-Length"]) # 文件总大小(理论值)
- if reporthook: # 写入前运行一次回调函数
- reporthook(blocknum, bs, size)
- for chunk in r.iter_content(chunk_size=1024):
- if chunk:
- fp.write(chunk)
- fp.flush()
- blocknum += 1
- if reporthook:
- reporthook(blocknum, bs, size) # 每写入一次运行一次回调函数
-
-
-def download(url, output_path, desc=None):
- """下载指定url的资源"""
- # 支持“下载”本地资源,以供fc2fan的本地镜像所使用
- if not url.startswith('http'):
- start_time = time.time()
- shutil.copyfile(url, output_path)
- filesize = os.path.getsize(url)
- elapsed = time.time() - start_time
- info = {'total': filesize, 'elapsed': elapsed, 'rate': filesize/elapsed}
- return info
- if not desc:
- desc = url.split('/')[-1]
- referrer = headers.copy()
- referrer['referer'] = url[:url.find('/', 8)+1] # 提取base_url部分
- with DownloadProgressBar(unit='B', unit_scale=True,
- miniters=1, desc=desc, leave=False) as t:
- urlretrieve(url, filename=output_path, reporthook=t.update_to, headers=referrer)
- info = {k: t.format_dict[k] for k in ('total', 'elapsed', 'rate')}
- return info
-
-
-def open_in_chrome(url, new=0, autoraise=True):
- """使用指定的Chrome Profile打开url,便于调试"""
- import subprocess
- chrome = R'C:\Program Files\Google\Chrome\Application\chrome.exe'
- subprocess.run(f'"{chrome}" --profile-directory="Profile 2" {url}', shell=True)
-
-import webbrowser
-webbrowser.open = open_in_chrome
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- download('https://www.javbus.com/pics/cover/6n54_b.jpg', 'cover.jpg')
diff --git a/javsp/web/dl_getchu.py b/javsp/web/dl_getchu.py
deleted file mode 100644
index 15267f1f7..000000000
--- a/javsp/web/dl_getchu.py
+++ /dev/null
@@ -1,122 +0,0 @@
-"""从dl.getchu官网抓取数据"""
-import re
-import logging
-
-from javsp.web.base import resp2html, request_get
-from javsp.web.exceptions import *
-from javsp.datatype import MovieInfo
-
-logger = logging.getLogger(__name__)
-
-# https://dl.getchu.com/i/item4045373
-base_url = 'https://dl.getchu.com'
-# dl.getchu用utf-8会乱码
-base_encode = 'euc-jp'
-
-
-def get_movie_title(html):
- container = html.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[2]")
- if len(container) > 0:
- container = container[0]
- rows = container.xpath('.//tr')
- title = ''
- for row in rows:
- for cell in row.xpath('.//td/div'):
- # 获取单元格文本内容
- if cell.text:
- title = str(cell.text).strip()
- return title
-
-
-def get_movie_img(html, getchu_id):
- img_src = ''
- container = html.xpath(f'//img[contains(@src, "{getchu_id}top.jpg")]')
- if len(container) > 0:
- container = container[0]
- img_src = container.get('src')
- return img_src
-
-
-def get_movie_preview(html, getchu_id):
- preview_pics = []
- container = html.xpath(f'//img[contains(@src, "{getchu_id}_")]')
- if len(container) > 0:
- for c in container:
- preview_pics.append(c.get('src'))
- return preview_pics
-
-
-DURATION_PATTERN = re.compile(r'(?:動画)?(\d+)分')
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- # 去除番号中的'GETCHU'字样
- id_uc = movie.dvdid.upper()
- if not id_uc.startswith('GETCHU-'):
- raise ValueError('Invalid GETCHU number: ' + movie.dvdid)
- getchu_id = id_uc.replace('GETCHU-', '')
- # 抓取网页
- url = f'{base_url}/i/item{getchu_id}'
- r = request_get(url, delay_raise=True)
- if r.status_code == 404:
- raise MovieNotFoundError(__name__, movie.dvdid)
- html = resp2html(r, base_encode)
- container = html.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[3]")
- if len(container) > 0:
- container = container[0]
- # 将表格提取为键值对
- rows = container.xpath('.//table/tr')
- kv_rows = [i for i in rows if len(i) == 2]
- data = {}
- for row in kv_rows:
- # 获取单元格文本内容
- key = row.xpath("td[@class='bluetext']/text()")[0]
- # 是否包含a标签: 有的属性是用表示的,不是text
- a_tags = row.xpath("td[2]/a")
- if a_tags:
- value = [i.text for i in a_tags]
- else:
- # 获取第2个td标签的内容(下标从1开始计数)
- value = row.xpath("td[2]/text()")
- data[key] = value
-
- for key, value in data.items():
- if key == 'サークル':
- movie.producer = value[0]
- elif key == '作者':
- # 暂时没有在getchu找到多个actress的片子
- movie.actress = [i.strip() for i in value]
- elif key == '画像数&ページ数':
- match = DURATION_PATTERN.search(' '.join(value))
- if match:
- movie.duration = match.group(1)
- elif key == '配信開始日':
- movie.publish_date = value[0].replace('/', '-')
- elif key == '趣向':
- movie.genre = value
- elif key == '作品内容':
- idx = -1
- for i, line in enumerate(value):
- if line.lstrip().startswith('※'):
- idx = i
- break
- movie.plot = ''.join(value[:idx])
-
- movie.title = get_movie_title(html)
- movie.cover = get_movie_img(html, getchu_id)
- movie.preview_pics = get_movie_preview(html, getchu_id)
- movie.dvdid = id_uc
- movie.url = url
-
-
-if __name__ == "__main__":
- import pretty_errors
-
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('getchu-4041026')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/fanza.py b/javsp/web/fanza.py
deleted file mode 100644
index e975c4c8f..000000000
--- a/javsp/web/fanza.py
+++ /dev/null
@@ -1,231 +0,0 @@
-"""从fanza抓取数据"""
-import os
-import re
-import sys
-import json
-import logging
-from typing import Dict, List, Tuple
-
-
-from javsp.web.base import Request, resp2html
-from javsp.web.exceptions import *
-from javsp.config import Cfg
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_url = 'https://www.dmm.co.jp'
-# 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面)
-request = Request()
-request.cookies = {'age_check_done': '1'}
-request.headers['Accept-Language'] = 'ja,en-US;q=0.9'
-
-
-_PRODUCT_PRIORITY = {'digital': 10, 'mono': 5, 'monthly': 2, 'rental': 1}
-_TYPE_PRIORITY = {'videoa': 10, 'anime': 8, 'nikkatsu': 6, 'doujin': 4, 'dvd': 3, 'ppr': 2, 'paradisetv': 1}
-def sort_search_result(result: List[Dict]):
- """排序搜索结果"""
- scores = {i['url']:(_PRODUCT_PRIORITY.get(i['product'], 0), _TYPE_PRIORITY.get(i['type'], 0)) for i in result}
- sorted_result = sorted(result, key=lambda x:scores[x['url']], reverse=True)
- return sorted_result
-
-
-def get_urls_of_cid(cid: str) -> Tuple[str, str]:
- """搜索cid可能的影片URL"""
- r = request.get(f"https://www.dmm.co.jp/search/?redirect=1&enc=UTF-8&category=&searchstr={cid}&commit.x=0&commit.y=0")
- if r.status_code == 404:
- raise MovieNotFoundError(__name__, cid)
- r.raise_for_status()
- html = resp2html_wrapper(r)
- result = html.xpath("//ul[@id='list']/li/div/p/a/@href")
- parsed_result = {}
- for url in result:
- items = url.split('/')
- type_, cid = None, None
- for i, part in enumerate(items):
- if part == '-':
- product, type_ = items[i-2], items[i-1]
- elif part.startswith('cid='):
- cid = part[4:]
- new_url = '/'.join(i for i in items if not i.startswith('?')) + '/'
- parsed_result.setdefault(cid, []).append({'product': product, 'type': type_, 'url': new_url})
- break
- if cid not in parsed_result:
- if len(result) > 0:
- logger.debug(f"Unknown URL in search result: " + ', '.join(result))
- raise MovieNotFoundError(__name__, cid)
- sorted_result = sort_search_result(parsed_result[cid])
- return sorted_result
-
-
-def resp2html_wrapper(resp):
- html = resp2html(resp)
- if 'not available in your region' in html.text_content():
- raise SiteBlocked('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置')
- elif '/login/' in resp.url:
- raise SiteBlocked('FANZA要求当前IP登录账号才可访问,请尝试更换为日本IP')
- return html
-
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- default_url = f'{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/'
- r0 = request.get(default_url, delay_raise=True)
- if r0.status_code == 404:
- urls = get_urls_of_cid(movie.cid)
- for d in urls:
- func_name = f"parse_{d['type']}_page"
- if func_name in globals():
- parse_func = globals()[func_name]
- else:
- logger.debug(f"不知道怎么解析 fanza {d['type']} 的页面: {d['url']}")
- continue
- r = request.get(d['url'])
- html = resp2html_wrapper(r)
- try:
- parse_func(movie, html)
- movie.url = d['url']
- break
- except:
- logger.debug(f"Fail to parse {d['url']}", exc_info=True)
- if d is urls[-1]:
- logger.warning(f"在fanza查找到的cid={movie.cid}的影片页面均解析失败")
- raise
- else:
- html = resp2html_wrapper(r0)
- parse_videoa_page(movie, html)
- movie.url = default_url
-
-
-def parse_videoa_page(movie: MovieInfo, html):
- """解析AV影片的页面布局"""
- title = html.xpath("//div[@class='hreview']/h1/text()")[0]
- # 注意: 浏览器在渲染时会自动加上了'tbody'字段,但是原始html网页中并没有,因此xpath解析时还是要按原始网页的来
- container = html.xpath("//table[@class='mg-b12']/tr/td")[0]
- cover = container.xpath("//div[@id='sample-video']/a/@href")[0]
- # 采用'配信開始日'作为发布日期: https://www.zhihu.com/question/57513172/answer/153219083
- date_tag = container.xpath("//td[text()='配信開始日:']/following-sibling::td/text()")
- if date_tag:
- movie.publish_date = date_tag[0].strip().replace('/', '-')
- duration_str = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")[0].strip()
- match = re.search(r'\d+', duration_str)
- if match:
- movie.duration = match.group(0)
- # 女优、导演、系列:字段不存在时,匹配将得到空列表。暂未发现有名字不显示在a标签中的情况
- actress = container.xpath("//span[@id='performer']/a/text()")
- director_tag = container.xpath("//td[text()='監督:']/following-sibling::td/a/text()")
- if director_tag:
- movie.director = director_tag[0].strip()
- serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()")
- if serial_tag:
- movie.serial = serial_tag[0].strip()
- producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()")
- if producer_tag:
- movie.producer = producer_tag[0].strip()
- # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到
- # label_tag = container.xpath("//td[text()='レーベル:']/following-sibling::td/a/text()")
- # if label_tag:
- # label = label_tag[0].strip()
- # fanza会把促销信息也写进genre……因此要根据tag指向的链接类型进行筛选
- genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'?keyword=') or contains(@href,'article=keyword')]")
- genre, genre_id = [], []
- for tag in genre_tags:
- genre.append(tag.text.strip())
- genre_id.append(tag.get('href').split('=')[-1].strip('/'))
- cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip()
- plot = container.xpath("//div[contains(@class, 'mg-b20 lh4')]/text()")[0].strip()
- preview_pics = container.xpath("//a[@name='sample-image']/img/@src")
- score_tag = container.xpath("//p[@class='d-review__average']/strong/text()")
- if score_tag:
- match = re.search(r'\d+', score_tag[0].strip())
- if match:
- score = float(match.group()) * 2
- movie.score = f'{score:.2f}'
- else:
- score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0]
- movie.score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 50
-
- if Cfg().crawler.hardworking:
- # 预览视频是动态加载的,不在静态网页中
- video_url = f'{base_url}/service/digitalapi/-/html5_player/=/cid={movie.cid}'
- html2 = request.get_html(video_url)
- # 目前用到js脚本的地方不多,所以不使用专门的js求值模块,先用正则提取文本然后用json解析数据
- script = html2.xpath("//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()")[0].strip()
- match = re.search(r'\{.*\}', script)
- # 主要是为了捕捉json.loads的异常,但是也借助try-except判断是否正则表达式是否匹配
- try:
- data = json.loads(match.group())
- video_url = data.get('src')
- if video_url and video_url.startswith('//'):
- video_url = 'https:' + video_url
- movie.preview_video = video_url
- except Exception as e:
- logger.debug('解析视频地址时异常: ' + repr(e))
-
- movie.cid = cid
- movie.title = title
- movie.cover = cover
- movie.actress = actress
- movie.genre = genre
- movie.genre_id = genre_id
- movie.plot = plot
- movie.preview_pics = preview_pics
- movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
-
-
-def parse_anime_page(movie: MovieInfo, html):
- """解析动画影片的页面布局"""
- title = html.xpath("//h1[@id='title']/text()")[0]
- container = html.xpath("//table[@class='mg-b12']/tr/td")[0]
- cover = container.xpath("//img[@name='package-image']/@src")[0]
- date_str = container.xpath("//td[text()='発売日:']/following-sibling::td/text()")[0].strip()
- publish_date = date_str.replace('/', '-')
- duration_tag = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")
- if duration_tag:
- movie.duration = duration_tag[0].strip().replace('分', '')
- serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()")
- if serial_tag:
- movie.serial = serial_tag[0].strip()
- producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()")
- if producer_tag:
- movie.producer = producer_tag[0].strip()
- genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]")
- genre, genre_id = [], []
- for tag in genre_tags:
- genre.append(tag.text.strip())
- genre_id.append(tag.get('href').split('=')[-1].strip('/'))
- cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip()
- plot = container.xpath("//div[@class='mg-b20 lh4']/p")[0].text_content().strip()
- preview_pics = container.xpath("//a[@name='sample-image']/img/@data-lazy")
- score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0]
- score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 50
-
- movie.cid = cid
- movie.title = title
- movie.cover = cover
- movie.publish_date = publish_date
- movie.genre = genre
- movie.genre_id = genre_id
- movie.plot = plot
- movie.score = f'{score/5:.2f}' # 转换为10分制
- movie.preview_pics = preview_pics
- movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
-
-
-# parse_dvd_page = parse_videoa_page # 118wtktabf067
-parse_ppr_page = parse_videoa_page
-parse_nikkatsu_page = parse_videoa_page
-parse_doujin_page = parse_anime_page
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo(cid='d_aisoft3356')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/fc2.py b/javsp/web/fc2.py
deleted file mode 100644
index 66be7ae4e..000000000
--- a/javsp/web/fc2.py
+++ /dev/null
@@ -1,105 +0,0 @@
-"""从FC2官网抓取数据"""
-import logging
-
-
-from javsp.web.base import get_html, request_get, resp2html
-from javsp.web.exceptions import *
-from javsp.config import Cfg
-from javsp.lib import strftime_to_minutes
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_url = 'https://adult.contents.fc2.com'
-
-
-def get_movie_score(fc2_id):
- """通过评论数据来计算FC2的影片评分(10分制),无法获得评分时返回None"""
- html = get_html(f'{base_url}/article/{fc2_id}/review')
- review_tags = html.xpath("//ul[@class='items_comment_headerReviewInArea']/li")
- reviews = {}
- for tag in review_tags:
- score = int(tag.xpath("div/span/text()")[0])
- vote = int(tag.xpath("span")[0].text_content())
- reviews[score] = vote
- total_votes = sum(reviews.values())
- if (total_votes >= 2): # 至少也该有两个人评价才有参考意义一点吧
- summary = sum([k*v for k, v in reviews.items()])
- final_score = summary / total_votes * 2 # 乘以2转换为10分制
- return final_score
-
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- # 去除番号中的'FC2'字样
- id_uc = movie.dvdid.upper()
- if not id_uc.startswith('FC2-'):
- raise ValueError('Invalid FC2 number: ' + movie.dvdid)
- fc2_id = id_uc.replace('FC2-', '')
- # 抓取网页
- url = f'{base_url}/article/{fc2_id}/'
- resp = request_get(url)
- if '/id.fc2.com/' in resp.url:
- raise SiteBlocked('FC2要求当前IP登录账号才可访问,请尝试更换为日本IP')
- html = resp2html(resp)
- container = html.xpath("//div[@class='items_article_left']")
- if len(container) > 0:
- container = container[0]
- else:
- raise MovieNotFoundError(__name__, movie.dvdid)
- # FC2 标题增加反爬乱码,使用数组合并标题
- title_arr = container.xpath("//div[@class='items_article_headerInfo']/h3/text()")
- title = ''.join(title_arr)
- thumb_tag = container.xpath("//div[@class='items_article_MainitemThumb']")[0]
- thumb_pic = thumb_tag.xpath("span/img/@src")[0]
- duration_str = thumb_tag.xpath("span/p[@class='items_article_info']/text()")[0]
- # FC2没有制作商和发行商的区分,作为个人市场,影片页面的'by'更接近于制作商
- producer = container.xpath("//li[text()='by ']/a/text()")[0]
- genre = container.xpath("//a[@class='tag tagTag']/text()")
- date_str = container.xpath("//div[@class='items_article_Releasedate']/p/text()")[0]
- publish_date = date_str[-10:].replace('/', '-') # '販売日 : 2017/11/30'
- preview_pics = container.xpath("//ul[@data-feed='sample-images']/li/a/@href")
-
- if Cfg().crawler.hardworking:
- # 通过评论数据来计算准确的评分
- score = get_movie_score(fc2_id)
- if score:
- movie.score = f'{score:.2f}'
- # 预览视频是动态加载的,不在静态网页中
- desc_frame_url = container.xpath("//section[@class='items_article_Contents']/iframe/@src")[0]
- key = desc_frame_url.split('=')[-1] # /widget/article/718323/description?ac=60fc08fa...
- api_url = f'{base_url}/api/v2/videos/{fc2_id}/sample?key={key}'
- r = request_get(api_url).json()
- movie.preview_video = r['path']
- else:
- # 获取影片评分。影片页面的评分只能粗略到星级,且没有分数,要通过类名来判断,如'items_article_Star5'表示5星
- score_tag_attr = container.xpath("//a[@class='items_article_Stars']/p/span/@class")[0]
- score = int(score_tag_attr[-1]) * 2
- movie.score = f'{score:.2f}'
-
- movie.dvdid = id_uc
- movie.url = url
- movie.title = title
- movie.genre = genre
- movie.producer = producer
- movie.duration = str(strftime_to_minutes(duration_str))
- movie.publish_date = publish_date
- movie.preview_pics = preview_pics
- # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面
- if movie.preview_pics:
- movie.cover = preview_pics[0]
- else:
- movie.cover = thumb_pic
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('FC2-718323')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/fc2fan.py b/javsp/web/fc2fan.py
deleted file mode 100644
index 229b3e3df..000000000
--- a/javsp/web/fc2fan.py
+++ /dev/null
@@ -1,80 +0,0 @@
-"""解析fc2fan本地镜像的数据"""
-# FC2官网的影片下架就无法再抓取数据,如果用户有fc2fan的镜像,那可以尝试从镜像中解析影片数据
-import os
-import re
-import logging
-import lxml.html
-import requests
-
-
-from javsp.web.base import resp2html
-from javsp.web.exceptions import *
-from javsp.config import Cfg
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_path = str(Cfg().crawler.fc2fan_local_path)
-use_local_mirror = os.path.exists(base_path)
-
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- if use_local_mirror:
- html_file = f'{base_path}/{movie.dvdid}.html'
- if not os.path.exists(html_file):
- raise MovieNotFoundError(__name__, movie.dvdid, html_file)
- html = lxml.html.parse(html_file)
- else:
- url = f"https://fc2club.top/html/{movie.dvdid}.html"
- r = requests.get(url)
- if r.status_code == 404:
- raise MovieNotFoundError(__name__, movie.dvdid)
- elif r.text == '':
- raise WebsiteError(f'fc2fan: 站点不可用 (HTTP {r.status_code}): {url}')
- html = resp2html(r)
- try:
- container = html.xpath("//div[@class='col-sm-8']")[0]
- except IndexError:
- raise WebsiteError(f'fc2fan: 站点不可用')
- title = container.xpath("h3/text()")[0]
- score_str = container.xpath("h5/strong[text()='影片评分']")[0].tail.strip()
- match = re.search(r'\d+', score_str)
- if match:
- score = int(match.group()) / 10 # fc2fan站长是按100分来打分的
- movie.score = f'{score:.1f}'
- resource_info = container.xpath("h5/strong[text()='资源参数']")[0].tail
- if '无码' in resource_info:
- movie.uncensored = True
- elif '有码' in resource_info:
- movie.uncensored = False
- # FC2没有制作商和发行商的区分,作为个人市场,卖家更接近于制作商
- producer = container.xpath("h5/strong[text()='卖家信息']")[0].getnext().text
- if producer:
- movie.producer = producer.strip()
- genre = container.xpath("h5/strong[text()='影片标签']/../a/text()")
- actress = container.xpath("h5/strong[text()='女优名字']/../a/text()")
- preview_pics = container.xpath("//ul[@class='slides']/li/img/@src")
- if use_local_mirror:
- preview_pics = [os.path.normpath(os.path.join(base_path, i)) for i in preview_pics]
- # big_preview = container.xpath("//img[@id='thumbpic']/../@href")[0] # 影片真实截图,目前暂时用不到
-
- movie.title = title
- movie.genre = genre
- movie.actress = actress
- if preview_pics:
- movie.preview_pics = preview_pics
- movie.cover = preview_pics[0]
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('FC2-1879420')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/fc2ppvdb.py b/javsp/web/fc2ppvdb.py
deleted file mode 100644
index b0ad60892..000000000
--- a/javsp/web/fc2ppvdb.py
+++ /dev/null
@@ -1,76 +0,0 @@
-"""从FC2PPVDB抓取数据"""
-import logging
-from typing import List
-
-
-from javsp.web.base import get_html
-from javsp.web.exceptions import *
-from javsp.lib import strftime_to_minutes
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_url = 'https://fc2ppvdb.com'
-
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- # 去除番号中的'FC2'字样
- id_uc = movie.dvdid.upper()
- if not id_uc.startswith('FC2-'):
- raise ValueError('Invalid FC2 number: ' + movie.dvdid)
- fc2_id = id_uc.replace('FC2-', '')
- # 抓取网页
- url = f'{base_url}/articles/{fc2_id}'
- html = get_html(url)
- container = html.xpath("//div[@class='container lg:px-5 px-2 py-12 mx-auto']/div[1]")
- if len(container) > 0:
- container = container[0]
- else:
- raise MovieNotFoundError(__name__, movie.dvdid)
-
- title = container.xpath("//h2/a/text()")
- thumb_pic = container.xpath(f"//img[@alt='{fc2_id}']/@src")
- duration_str = container.xpath("//div[starts-with(text(),'収録時間:')]/span/text()")
- actress = container.xpath("//div[starts-with(text(),'女優:')]/span/a/text()")
- genre = container.xpath("//div[starts-with(text(),'タグ:')]/span/a/text()")
- publish_date = container.xpath("//div[starts-with(text(),'販売日:')]/span/text()")
- publisher = container.xpath("//div[starts-with(text(),'販売者:')]/span/a/text()")
- uncensored_str = container.xpath("//div[starts-with(text(),'モザイク:')]/span/text()")
- uncensored_str_f = get_list_first(uncensored_str);
- uncensored = True if uncensored_str_f == '無' else False if uncensored_str_f == '有' else None
- preview_pics = None
- preview_video = container.xpath("//a[starts-with(text(),'サンプル動画')]/@href")
-
- movie.dvdid = id_uc
- movie.url = url
- movie.title = get_list_first(title)
- movie.genre = genre
- movie.actress = actress
- movie.duration = str(strftime_to_minutes(get_list_first(duration_str)))
- movie.publish_date = get_list_first(publish_date)
- movie.publisher = get_list_first(publisher)
- movie.uncensored = uncensored
- movie.preview_pics = preview_pics
- movie.preview_video = get_list_first(preview_video)
-
- # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面
- if movie.preview_pics:
- movie.cover = preview_pics[0]
- else:
- movie.cover = get_list_first(thumb_pic)
-
-def get_list_first(list:List):
- return list[0] if list and len(list) > 0 else None
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('FC2-4497837')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/gyutto.py b/javsp/web/gyutto.py
deleted file mode 100644
index db7d6c795..000000000
--- a/javsp/web/gyutto.py
+++ /dev/null
@@ -1,87 +0,0 @@
-"""从https://gyutto.com/官网抓取数据"""
-import logging
-import time
-
-from javsp.web.base import resp2html, request_get
-from javsp.web.exceptions import *
-from javsp.datatype import MovieInfo
-
-logger = logging.getLogger(__name__)
-
-# https://dl.gyutto.com/i/item266923
-base_url = 'http://gyutto.com'
-base_encode = 'euc-jp'
-
-def get_movie_title(html):
- container = html.xpath("//h1")
- if len(container) > 0:
- container = container[0]
- title = container.text
-
- return title
-
-def get_movie_img(html, index = 1):
- images = []
- container = html.xpath("//a[@class='highslide']/img")
- if len(container) > 0:
- if index == 0:
- return container[0].get('src')
-
- for row in container:
- images.append(row.get('src'))
-
- return images
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- # 去除番号中的'gyutto'字样
- id_uc = movie.dvdid.upper()
- if not id_uc.startswith('GYUTTO-'):
- raise ValueError('Invalid gyutto number: ' + movie.dvdid)
- gyutto_id = id_uc.replace('GYUTTO-', '')
- # 抓取网页
- url = f'{base_url}/i/item{gyutto_id}?select_uaflag=1'
- r = request_get(url, delay_raise=True)
- if r.status_code == 404:
- raise MovieNotFoundError(__name__, movie.dvdid)
- html = resp2html(r, base_encode)
- container = html.xpath("//dl[@class='BasicInfo clearfix']")
-
- for row in container:
- key = row.xpath(".//dt/text()")
- if key[0] == "サークル":
- producer = ''.join(row.xpath(".//dd/a/text()"))
- elif key[0] == "ジャンル":
- genre = row.xpath(".//dd/a/text()")
- elif key[0] == "配信開始日":
- date = row.xpath(".//dd/text()")
- date_str = ''.join(date)
- date_time = time.strptime(date_str, "%Y年%m月%d日")
- publish_date = time.strftime("%Y-%m-%d", date_time)
-
- plot = html.xpath("//div[@class='unit_DetailLead']/p/text()")[0]
-
- movie.title = get_movie_title(html)
- movie.cover = get_movie_img(html, 0)
- movie.preview_pics = get_movie_img(html)
- movie.dvdid = id_uc
- movie.url = url
- movie.producer = producer
- # movie.actress = actress
- # movie.duration = duration
- movie.publish_date = publish_date
- movie.genre = genre
- movie.plot = plot
-
-if __name__ == "__main__":
- import pretty_errors
-
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('gyutto-266923')
-
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/jav321.py b/javsp/web/jav321.py
deleted file mode 100644
index 4e42617a5..000000000
--- a/javsp/web/jav321.py
+++ /dev/null
@@ -1,100 +0,0 @@
-"""从jav321抓取数据"""
-import re
-import logging
-
-
-from javsp.web.base import post_html
-from javsp.web.exceptions import *
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_url = 'https://www.jav321.com'
-
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- html = post_html(f'{base_url}/search', data={'sn': movie.dvdid})
- page_url = html.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0]
- #TODO: 注意cid是dmm的概念。如果影片来自MGSTAGE,这里的cid很可能是jav321自己添加的,例如 345SIMM-542
- cid = page_url.split('/')[-1] # /video/ipx00177
- # 如果从URL匹配到的cid是'search',说明还停留在搜索页面,找不到这部影片
- if cid == 'search':
- raise MovieNotFoundError(__name__, movie.dvdid)
- title = html.xpath("//div[@class='panel-heading']/h3/text()")[0]
- info = html.xpath("//div[@class='col-md-9']")[0]
- # jav321的不同信息字段间没有明显分隔,只能通过url来匹配目标标签
- company_tags = info.xpath("a[contains(@href,'/company/')]/text()")
- if company_tags:
- movie.producer = company_tags[0]
- # actress, actress_pics
- # jav321现在连女优信息都没有了,首页通过女优栏跳转过去也全是空白
- actress, actress_pics = [], {}
- actress_tags = html.xpath("//div[@class='thumbnail']/a[contains(@href,'/star/')]/img")
- for tag in actress_tags:
- name = tag.tail.strip()
- pic_url = tag.get('src')
- actress.append(name)
- # jav321的女优头像完全是应付了事:即使女优实际没有头像,也会有一个看起来像模像样的url,
- # 因而无法通过url判断女优头像图片是否有效。有其他选择时最好不要使用jav321的女优头像数据
- actress_pics[name] = pic_url
- # genre, genre_id
- genre_tags = info.xpath("a[contains(@href,'/genre/')]")
- genre, genre_id = [], []
- for tag in genre_tags:
- genre.append(tag.text)
- genre_id.append(tag.get('href').split('/')[-2]) # genre/4025/1
- dvdid = info.xpath("b[text()='品番']")[0].tail.replace(': ', '').upper()
- publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(': ', '')
- duration_str = info.xpath("b[text()='収録時間']")[0].tail
- match = re.search(r'\d+', duration_str)
- if match:
- movie.duration = match.group(0)
- # 仅部分影片有评分且评分只能粗略到星级而没有分数,要通过星级的图片来判断,如'/img/35.gif'表示3.5星
- score_tag = info.xpath("//b[text()='平均評価']/following-sibling::img/@data-original")
- if score_tag:
- score = int(score_tag[0][5:7])/5 # /10*2
- movie.score = str(score)
- serial_tag = info.xpath("a[contains(@href,'/series/')]/text()")
- if serial_tag:
- movie.serial = serial_tag[0]
- preview_video_tag = info.xpath("//video/source/@src")
- if preview_video_tag:
- movie.preview_video = preview_video_tag[0]
- plot_tag = info.xpath("//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()")
- if plot_tag:
- movie.plot = plot_tag[0]
- preview_pics = html.xpath("//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src")
- if len(preview_pics) == 0:
- # 尝试搜索另一种布局下的封面,需要使用onerror过滤掉明明没有封面时网站往里面塞的默认URL
- preview_pics = html.xpath("//div/div/div[@class='col-md-3']/img[@onerror and @class='img-responsive']/@src")
- # 有的图片链接里有多个//,网站质量堪忧……
- preview_pics = [i[:8] + i[8:].replace('//', '/') for i in preview_pics]
- # 磁力和ed2k链接是依赖js脚本加载的,无法通过静态网页来解析
-
- movie.url = page_url
- movie.cid = cid
- movie.dvdid = dvdid
- movie.title = title
- movie.actress = actress
- movie.actress_pics = actress_pics
- movie.genre = genre
- movie.genre_id = genre_id
- movie.publish_date = publish_date
- # preview_pics的第一张图始终是封面,剩下的才是预览图
- if len(preview_pics) > 0:
- movie.cover = preview_pics[0]
- movie.preview_pics = preview_pics[1:]
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('SCUTE-1177')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/javbus.py b/javsp/web/javbus.py
deleted file mode 100644
index a98cd9974..000000000
--- a/javsp/web/javbus.py
+++ /dev/null
@@ -1,115 +0,0 @@
-"""从JavBus抓取数据"""
-import logging
-
-
-from javsp.web.base import *
-from javsp.web.exceptions import *
-from javsp.func import *
-from javsp.config import Cfg, CrawlerID
-from javsp.datatype import MovieInfo, GenreMap
-
-
-logger = logging.getLogger(__name__)
-genre_map = GenreMap('data/genre_javbus.csv')
-permanent_url = 'https://www.javbus.com'
-if Cfg().network.proxy_server is not None:
- base_url = permanent_url
-else:
- base_url = str(Cfg().network.proxy_free[CrawlerID.javbus])
-
-
-def parse_data(movie: MovieInfo):
- """从网页抓取并解析指定番号的数据
- Args:
- movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
- """
- url = f'{base_url}/{movie.dvdid}'
- resp = request_get(url, delay_raise=True)
- # 疑似JavBus检测到类似爬虫的行为时会要求登录,不过发现目前不需要登录也可以从重定向前的网页中提取信息
- if resp.history and resp.history[0].status_code == 302:
- html = resp2html(resp.history[0])
- else:
- html = resp2html(resp)
- # 引入登录验证后状态码不再准确,因此还要额外通过检测标题来确认是否发生了404
- page_title = html.xpath('/html/head/title/text()')
- if page_title and page_title[0].startswith('404 Page Not Found!'):
- raise MovieNotFoundError(__name__, movie.dvdid)
-
- container = html.xpath("//div[@class='container']")[0]
- title = container.xpath("h3/text()")[0]
- cover = container.xpath("//a[@class='bigImage']/img/@src")[0]
- preview_pics = container.xpath("//div[@id='sample-waterfall']/a/@href")
- info = container.xpath("//div[@class='col-md-3 info']")[0]
- dvdid = info.xpath("p/span[text()='識別碼:']")[0].getnext().text
- publish_date = info.xpath("p/span[text()='發行日期:']")[0].tail.strip()
- duration = info.xpath("p/span[text()='長度:']")[0].tail.replace('分鐘', '').strip()
- director_tag = info.xpath("p/span[text()='導演:']")
- if director_tag: # xpath没有匹配时将得到空列表
- movie.director = director_tag[0].getnext().text.strip()
- producer_tag = info.xpath("p/span[text()='製作商:']")
- if producer_tag:
- text = producer_tag[0].getnext().text
- if text:
- movie.producer = text.strip()
- publisher_tag = info.xpath("p/span[text()='發行商:']")
- if publisher_tag:
- movie.publisher = publisher_tag[0].getnext().text.strip()
- serial_tag = info.xpath("p/span[text()='系列:']")
- if serial_tag:
- movie.serial = serial_tag[0].getnext().text
- # genre, genre_id
- genre_tags = info.xpath("//span[@class='genre']/label/a")
- genre, genre_id = [], []
- for tag in genre_tags:
- tag_url = tag.get('href')
- pre_id = tag_url.split('/')[-1]
- genre.append(tag.text)
- if 'uncensored' in tag_url:
- movie.uncensored = True
- genre_id.append('uncensored-' + pre_id)
- else:
- movie.uncensored = False
- genre_id.append(pre_id)
- # JavBus的磁力链接是依赖js脚本加载的,无法通过静态网页来解析
- # actress, actress_pics
- actress, actress_pics = [], {}
- actress_tags = html.xpath("//a[@class='avatar-box']/div/img")
- for tag in actress_tags:
- name = tag.get('title')
- pic_url = tag.get('src')
- actress.append(name)
- if not pic_url.endswith('nowprinting.gif'): # 略过默认的头像
- actress_pics[name] = pic_url
- # 整理数据并更新movie的相应属性
- movie.url = f'{permanent_url}/{movie.dvdid}'
- movie.dvdid = dvdid
- movie.title = title.replace(dvdid, '').strip()
- movie.cover = cover
- movie.preview_pics = preview_pics
- if publish_date != '0000-00-00': # 丢弃无效的发布日期
- movie.publish_date = publish_date
- movie.duration = duration if int(duration) else None
- movie.genre = genre
- movie.genre_id = genre_id
- movie.actress = actress
- movie.actress_pics = actress_pics
-
-
-def parse_clean_data(movie: MovieInfo):
- """解析指定番号的影片数据并进行清洗"""
- parse_data(movie)
- movie.genre_norm = genre_map.map(movie.genre_id)
- movie.genre_id = None # 没有别的地方需要再用到,清空genre id(暗示已经完成转换)
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('NANP-030')
- try:
- parse_clean_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
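The pre-redirect trick above (parsing resp.history[0] when JavBus answers with a 302 pointing at its login page) carries over to the async stack this PR introduces; aiohttp is added to poetry.lock below. A rough sketch of the same idea with redirects disabled so the 302 body stays readable; the function name and the direct use of aiohttp/lxml are illustrative and not the actual javsp.network or javsp.crawlers API:

```python
import asyncio
import aiohttp
from lxml import html as lxml_html

async def fetch_detail_page(avid: str, base_url: str = 'https://www.javbus.com'):
    """Fetch a JavBus detail page without following redirects, so that the body of a
    302 response (login demanded) can still be parsed for movie data."""
    async with aiohttp.ClientSession() as session:
        async with session.get(f'{base_url}/{avid}', allow_redirects=False) as resp:
            text = await resp.text()
    return lxml_html.fromstring(text)

# html = asyncio.run(fetch_detail_page('NANP-030'))
```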
diff --git a/javsp/web/javdb.py b/javsp/web/javdb.py
deleted file mode 100644
index 5120aae76..000000000
--- a/javsp/web/javdb.py
+++ /dev/null
@@ -1,333 +0,0 @@
-"""从JavDB抓取数据"""
-import os
-import re
-import logging
-
-from javsp.web.base import Request, resp2html
-from javsp.web.exceptions import *
-from javsp.func import *
-from javsp.avid import guess_av_type
-from javsp.config import Cfg, CrawlerID
-from javsp.datatype import MovieInfo, GenreMap
-from javsp.chromium import get_browsers_cookies
-
-
-# 初始化Request实例。使用scraper绕过CloudFlare后,需要指定网页语言,否则可能会返回其他语言网页,影响解析
-request = Request(use_scraper=True)
-request.headers['Accept-Language'] = 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5'
-
-logger = logging.getLogger(__name__)
-genre_map = GenreMap('data/genre_javdb.csv')
-permanent_url = 'https://javdb.com'
-if Cfg().network.proxy_server is not None:
- base_url = permanent_url
-else:
- base_url = str(Cfg().network.proxy_free[CrawlerID.javdb])
-
-
-def get_html_wrapper(url):
- """包装外发的request请求并负责转换为可xpath的html,同时处理Cookies无效等问题"""
- global request, cookies_pool
- r = request.get(url, delay_raise=True)
- if r.status_code == 200:
- # 发生重定向可能仅仅是域名重定向,因此还要检查url以判断是否被跳转到了登录页
- if r.history and '/login' in r.url:
- # 仅在需要时去读取Cookies
- if 'cookies_pool' not in globals():
- try:
- cookies_pool = get_browsers_cookies()
- except (PermissionError, OSError) as e:
- logger.warning(f"无法从浏览器Cookies文件获取JavDB的登录凭据({e}),可能是安全软件在保护浏览器Cookies文件", exc_info=True)
- cookies_pool = []
- except Exception as e:
- logger.warning(f"获取JavDB的登录凭据时出错({e}),你可能使用的是国内定制版等非官方Chrome系浏览器", exc_info=True)
- cookies_pool = []
- if len(cookies_pool) > 0:
- item = cookies_pool.pop()
- # 更换Cookies时需要创建新的request实例,否则cloudscraper会保留它内部第一次发起网络访问时获得的Cookies
- request = Request(use_scraper=True)
- request.cookies = item['cookies']
- cookies_source = (item['profile'], item['site'])
- logger.debug(f'未携带有效Cookies而发生重定向,尝试更换Cookies为: {cookies_source}')
- return get_html_wrapper(url)
- else:
- raise CredentialError('JavDB: 所有浏览器Cookies均已过期')
- elif r.history and 'pay' in r.url.split('/')[-1]:
- raise SitePermissionError(f"JavDB: 此资源被限制为仅VIP可见: '{r.history[0].url}'")
- else:
- html = resp2html(r)
- return html
- elif r.status_code in (403, 503):
- html = resp2html(r)
- code_tag = html.xpath("//span[@class='code-label']/span")
- error_code = code_tag[0].text if code_tag else None
- if error_code:
- if error_code == '1020':
- block_msg = f'JavDB: {r.status_code} 禁止访问: 站点屏蔽了来自日本地区的IP地址,请使用其他地区的代理服务器'
- else:
- block_msg = f'JavDB: {r.status_code} 禁止访问: {url} (Error code: {error_code})'
- else:
- block_msg = f'JavDB: {r.status_code} 禁止访问: {url}'
- raise SiteBlocked(block_msg)
- else:
- raise WebsiteError(f'JavDB: {r.status_code} 非预期状态码: {url}')
-
-
-def get_user_info(site, cookies):
- """获取cookies对应的JavDB用户信息"""
- try:
- request.cookies = cookies
- html = request.get_html(f'https://{site}/users/profile')
- except Exception as e:
- logger.info('JavDB: 获取用户信息时出错')
- logger.debug(e, exc_info=1)
- return
- # 扫描浏览器得到的Cookies对应的临时域名可能会过期,因此需要先判断域名是否仍然指向JavDB的站点
- if 'JavDB' in html.text:
- email = html.xpath("//div[@class='user-profile']/ul/li[1]/span/following-sibling::text()")[0].strip()
- username = html.xpath("//div[@class='user-profile']/ul/li[2]/span/following-sibling::text()")[0].strip()
- return email, username
- else:
- logger.debug('JavDB: 域名已过期: ' + site)
-
-
-def get_valid_cookies():
- """扫描浏览器,获取一个可用的Cookies"""
- # 经测试,Cookies所发往的域名不需要和登录时的域名保持一致,只要Cookies有效即可在多个域名间使用
- for d in cookies_pool:
- info = get_user_info(d['site'], d['cookies'])
- if info:
- return d['cookies']
- else:
- logger.debug(f"{d['profile']}, {d['site']}: Cookies无效")
-
-
-def parse_data(movie: MovieInfo):
- """从网页抓取并解析指定番号的数据
- Args:
- movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
- """
- # JavDB搜索番号时会有多个搜索结果,从中查找匹配番号的那个
- html = get_html_wrapper(f'{base_url}/search?q={movie.dvdid}')
- ids = list(map(str.lower, html.xpath("//div[@class='video-title']/strong/text()")))
- movie_urls = html.xpath("//a[@class='box']/@href")
- match_count = len([i for i in ids if i == movie.dvdid.lower()])
- if match_count == 0:
- raise MovieNotFoundError(__name__, movie.dvdid, ids)
- elif match_count == 1:
- index = ids.index(movie.dvdid.lower())
- new_url = movie_urls[index]
- try:
- html2 = get_html_wrapper(new_url)
- except (SitePermissionError, CredentialError):
- # 不开VIP不让看,过分。决定榨出能获得的信息,毕竟有时候只有这里能找到标题和封面
- box = html.xpath("//a[@class='box']")[index]
- movie.url = new_url
- movie.title = box.get('title')
- movie.cover = box.xpath("div/img/@src")[0]
- score_str = box.xpath("div[@class='score']/span/span")[0].tail
- score = re.search(r'([\d.]+)分', score_str).group(1)
- movie.score = "{:.2f}".format(float(score)*2)
- movie.publish_date = box.xpath("div[@class='meta']/text()")[0].strip()
- return
- else:
- raise MovieDuplicateError(__name__, movie.dvdid, match_count)
-
- container = html2.xpath("/html/body/section/div/div[@class='video-detail']")[0]
- info = container.xpath("//nav[@class='panel movie-panel-info']")[0]
- title = container.xpath("h2/strong[@class='current-title']/text()")[0]
- show_orig_title = container.xpath("//a[contains(@class, 'meta-link') and not(contains(@style, 'display: none'))]")
- if show_orig_title:
- movie.ori_title = container.xpath("h2/span[@class='origin-title']/text()")[0]
- cover = container.xpath("//img[@class='video-cover']/@src")[0]
- preview_pics = container.xpath("//a[@class='tile-item'][@data-fancybox='gallery']/@href")
- preview_video_tag = container.xpath("//video[@id='preview-video']/source/@src")
- if preview_video_tag:
- preview_video = preview_video_tag[0]
- if preview_video.startswith('//'):
- preview_video = 'https:' + preview_video
- movie.preview_video = preview_video
- dvdid = info.xpath("div/span")[0].text_content()
- publish_date = info.xpath("div/strong[text()='日期:']")[0].getnext().text
- duration = info.xpath("div/strong[text()='時長:']")[0].getnext().text.replace('分鍾', '').strip()
- director_tag = info.xpath("div/strong[text()='導演:']")
- if director_tag:
- movie.director = director_tag[0].getnext().text_content().strip()
- av_type = guess_av_type(movie.dvdid)
- if av_type != 'fc2':
- producer_tag = info.xpath("div/strong[text()='片商:']")
- else:
- producer_tag = info.xpath("div/strong[text()='賣家:']")
- if producer_tag:
- movie.producer = producer_tag[0].getnext().text_content().strip()
- publisher_tag = info.xpath("div/strong[text()='發行:']")
- if publisher_tag:
- movie.publisher = publisher_tag[0].getnext().text_content().strip()
- serial_tag = info.xpath("div/strong[text()='系列:']")
- if serial_tag:
- movie.serial = serial_tag[0].getnext().text_content().strip()
- score_tag = info.xpath("//span[@class='score-stars']")
- if score_tag:
- score_str = score_tag[0].tail
- score = re.search(r'([\d.]+)分', score_str).group(1)
- movie.score = "{:.2f}".format(float(score)*2)
- genre_tags = info.xpath("//strong[text()='類別:']/../span/a")
- genre, genre_id = [], []
- for tag in genre_tags:
- pre_id = tag.get('href').split('/')[-1]
- genre.append(tag.text)
- genre_id.append(pre_id)
- # 判定影片有码/无码
- subsite = pre_id.split('?')[0]
- movie.uncensored = {'uncensored': True, 'tags':False}.get(subsite)
- # JavDB目前同时提供男女优信息,根据用来标识性别的符号筛选出女优
- actors_tag = info.xpath("//strong[text()='演員:']/../span")[0]
- all_actors = actors_tag.xpath("a/text()")
- genders = actors_tag.xpath("strong/text()")
- actress = [i for i in all_actors if genders[all_actors.index(i)] == '♀']
- magnet = container.xpath("//div[@class='magnet-name column is-four-fifths']/a/@href")
-
- movie.dvdid = dvdid
- movie.url = new_url.replace(base_url, permanent_url)
- movie.title = title.replace(dvdid, '').strip()
- movie.cover = cover
- movie.preview_pics = preview_pics
- movie.publish_date = publish_date
- movie.duration = duration
- movie.genre = genre
- movie.genre_id = genre_id
- movie.actress = actress
- movie.magnet = [i.replace('[javdb.com]','') for i in magnet]
-
-
-def parse_clean_data(movie: MovieInfo):
- """解析指定番号的影片数据并进行清洗"""
- try:
- parse_data(movie)
- # 检查封面URL是否真的存在对应图片
- if movie.cover is not None:
- r = request.head(movie.cover)
- if r.status_code != 200:
- movie.cover = None
- except SiteBlocked:
- raise
- logger.error('JavDB: 可能触发了反爬虫机制,请稍后再试')
- if movie.genre_id and (not movie.genre_id[0].startswith('fc2?')):
- movie.genre_norm = genre_map.map(movie.genre_id)
- movie.genre_id = None # 没有别的地方需要再用到,清空genre id(表明已经完成转换)
-
-
-def collect_actress_alias(type=0, use_original=True):
- """
- 收集女优的别名
- type: 0-有码, 1-无码, 2-欧美
- use_original: 是否使用原名而非译名,True-田中レモン,False-田中檸檬
- """
- import json
- import time
- import random
-
- actressAliasMap = {}
-
- actressAliasFilePath = "data/actress_alias.json"
- # 检查文件是否存在
- if not os.path.exists(actressAliasFilePath):
- # 如果文件不存在,创建文件并写入空字典
- with open(actressAliasFilePath, "w", encoding="utf-8") as file:
- json.dump({}, file)
-
- typeList = ["censored", "uncensored", "western"]
- page_url = f"{base_url}/actors/{typeList[type]}"
- while True:
- try:
- html = get_html_wrapper(page_url)
- actors = html.xpath("//div[@class='box actor-box']/a")
-
- count = 0
- for actor in actors:
- count += 1
- actor_name = actor.xpath("strong/text()")[0].strip()
- actor_url = actor.xpath("@href")[0]
- # actor_url = f"https://javdb.com{actor_url}" # 构造演员主页的完整URL
-
- # 进入演员主页,获取更多信息
- actor_html = get_html_wrapper(actor_url)
- # 解析演员所有名字信息
- names_span = actor_html.xpath("//span[@class='actor-section-name']")[0]
- aliases_span_list = actor_html.xpath("//span[@class='section-meta']")
- aliases_span = aliases_span_list[0]
-
- names_list = [name.strip() for name in names_span.text.split(",")]
- if len(aliases_span_list) > 1:
- aliases_list = [
- alias.strip() for alias in aliases_span.text.split(",")
- ]
- else:
- aliases_list = []
-
- # 将信息添加到actressAliasMap中
- actressAliasMap[names_list[-1 if use_original else 0]] = (
- names_list + aliases_list
- )
- print(
- f"{count} --- {names_list[-1 if use_original else 0]}: {names_list + aliases_list}"
- )
-
- if count == 10:
- # 将数据写回文件
- with open(actressAliasFilePath, "r", encoding="utf-8") as file:
- existing_data = json.load(file)
-
- # 合并现有数据和新爬取的数据
- existing_data.update(actressAliasMap)
-
- # 将合并后的数据写回文件
- with open(actressAliasFilePath, "w", encoding="utf-8") as file:
- json.dump(existing_data, file, ensure_ascii=False, indent=2)
-
- actressAliasMap = {} # 重置actressAliasMap
-
- print(
- f"已爬取 {count} 个女优,数据已更新并写回文件:",
- actressAliasFilePath,
- )
-
- # 重置计数器
- count = 0
-
- time.sleep(max(1, 10 * random.random())) # 随机等待 1-10 秒
-
- # 判断是否有下一页按钮
- next_page_link = html.xpath(
- "//a[@rel='next' and @class='pagination-next']/@href"
- )
- if not next_page_link:
- break # 没有下一页,结束循环
- else:
- next_page_url = f"{next_page_link[0]}"
- page_url = next_page_url
-
- except SiteBlocked:
- raise
-
- with open(actressAliasFilePath, "r", encoding="utf-8") as file:
- existing_data = json.load(file)
-
- # 合并现有数据和新爬取的数据
- existing_data.update(actressAliasMap)
-
- # 将合并后的数据写回文件
- with open(actressAliasFilePath, "w", encoding="utf-8") as file:
- json.dump(existing_data, file, ensure_ascii=False, indent=2)
-
- print(f"已爬取 {count} 个女优,数据已更新并写回文件:", actressAliasFilePath)
-
-
-if __name__ == "__main__":
- # collect_actress_alias()
- movie = MovieInfo('FC2-2735981')
- try:
- parse_clean_data(movie)
- print(movie)
- except CrawlerError as e:
- print(repr(e))
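JavDB lists male and female performers together, and the code above pairs each name with the gender symbol that follows it, keeping only the '♀' entries via all_actors.index(i); .index() returns the first match, so repeated names can pair with the wrong marker. A positional pairing with zip is a safer equivalent (a sketch, not the project's implementation):

```python
from typing import List

def filter_actresses(names: List[str], gender_marks: List[str]) -> List[str]:
    """Keep only the names whose positional gender marker is the female symbol."""
    return [name for name, mark in zip(names, gender_marks) if mark == '♀']

# Duplicate names stay paired with their own marker:
assert filter_actresses(['A', 'B', 'A'], ['♀', '♂', '♀']) == ['A', 'A']
```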
diff --git a/javsp/web/javlib.py b/javsp/web/javlib.py
deleted file mode 100644
index 85f77b75f..000000000
--- a/javsp/web/javlib.py
+++ /dev/null
@@ -1,141 +0,0 @@
-"""从JavLibrary抓取数据"""
-import logging
-from urllib.parse import urlsplit
-
-
-from javsp.web.base import Request, read_proxy, resp2html
-from javsp.web.exceptions import *
-from javsp.web.proxyfree import get_proxy_free_url
-from javsp.config import Cfg, CrawlerID
-from javsp.datatype import MovieInfo
-
-
-# 初始化Request实例
-request = Request(use_scraper=True)
-
-logger = logging.getLogger(__name__)
-permanent_url = 'https://www.javlibrary.com'
-base_url = ''
-
-
-def init_network_cfg():
- """设置合适的代理模式和base_url"""
- request.timeout = 5
- proxy_free_url = get_proxy_free_url('javlib')
- urls = [str(Cfg().network.proxy_free[CrawlerID.javlib]), permanent_url]
- if proxy_free_url and proxy_free_url not in urls:
- urls.insert(1, proxy_free_url)
- # 使用代理容易触发IUAM保护,先尝试不使用代理访问
- proxy_cfgs = [{}, read_proxy()] if Cfg().network.proxy_server else [{}]
- for proxies in proxy_cfgs:
- request.proxies = proxies
- for url in urls:
- if proxies == {} and url == permanent_url:
- continue
- try:
- resp = request.get(url, delay_raise=True)
- if resp.status_code == 200:
- request.timeout = Cfg().network.timeout.seconds
- return url
- except Exception as e:
- logger.debug(f"Fail to connect to '{url}': {e}")
- logger.warning('无法绕开JavLib的反爬机制')
- request.timeout = Cfg().network.timeout.seconds
- return permanent_url
-
-
-# TODO: 发现JavLibrary支持使用cid搜索,会直接跳转到对应的影片页面,也许可以利用这个功能来做cid到dvdid的转换
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- global base_url
- if not base_url:
- base_url = init_network_cfg()
- logger.debug(f"JavLib网络配置: {base_url}, proxy={request.proxies}")
- url = new_url = f'{base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}'
- resp = request.get(url)
- html = resp2html(resp)
- if resp.history:
- if urlsplit(resp.url).netloc == urlsplit(base_url).netloc:
- # 出现301重定向通常且新老地址netloc相同时,说明搜索到了影片且只有一个结果
- new_url = resp.url
- else:
- # 重定向到了不同的netloc时,新地址并不是影片地址。这种情况下新地址中丢失了path字段,
- # 为无效地址(应该是JavBus重定向配置有问题),需要使用新的base_url抓取数据
- base_url = 'https://' + urlsplit(resp.url).netloc
- logger.warning(f"请将配置文件中的JavLib免代理地址更新为: {base_url}")
- return parse_data(movie)
- else: # 如果有多个搜索结果则不会自动跳转,此时需要程序介入选择搜索结果
- video_tags = html.xpath("//div[@class='video'][@id]/a")
- # 通常第一部影片就是我们要找的,但是以免万一还是遍历所有搜索结果
- pre_choose = []
- for tag in video_tags:
- tag_dvdid = tag.xpath("div[@class='id']/text()")[0]
- if tag_dvdid.upper() == movie.dvdid.upper():
- pre_choose.append(tag)
- pre_choose_urls = [i.get('href') for i in pre_choose]
- match_count = len(pre_choose)
- if match_count == 0:
- raise MovieNotFoundError(__name__, movie.dvdid)
- elif match_count == 1:
- new_url = pre_choose_urls[0]
- elif match_count == 2:
- no_blueray = []
- for tag in pre_choose:
- if 'ブルーレイディスク' not in tag.get('title'): # Blu-ray Disc
- no_blueray.append(tag)
- no_blueray_count = len(no_blueray)
- if no_blueray_count == 1:
- new_url = no_blueray[0].get('href')
- logger.debug(f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果,已自动选择封面比例正确的一个: {new_url}")
- else:
- # 两个结果中没有谁是蓝光影片,说明影片番号重复了
- raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls)
- else:
- # 存在不同影片但是番号相同的情况,如MIDV-010
- raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls)
- # 重新抓取网页
- html = request.get_html(new_url)
- container = html.xpath("/html/body/div/div[@id='rightcolumn']")[0]
- title_tag = container.xpath("div/h3/a/text()")
- title = title_tag[0]
- cover = container.xpath("//img[@id='video_jacket_img']/@src")[0]
- info = container.xpath("//div[@id='video_info']")[0]
- dvdid = info.xpath("div[@id='video_id']//td[@class='text']/text()")[0]
- publish_date = info.xpath("div[@id='video_date']//td[@class='text']/text()")[0]
- duration = info.xpath("div[@id='video_length']//span[@class='text']/text()")[0]
- director_tag = info.xpath("//span[@class='director']/a/text()")
- if director_tag:
- movie.director = director_tag[0]
- producer = info.xpath("//span[@class='maker']/a/text()")[0]
- publisher_tag = info.xpath("//span[@class='label']/a/text()")
- if publisher_tag:
- movie.publisher = publisher_tag[0]
- score_tag = info.xpath("//span[@class='score']/text()")
- if score_tag:
- movie.score = score_tag[0].strip('()')
- genre = info.xpath("//span[@class='genre']/a/text()")
- actress = info.xpath("//span[@class='star']/a/text()")
-
- movie.dvdid = dvdid
- movie.url = new_url.replace(base_url, permanent_url)
- movie.title = title.replace(dvdid, '').strip()
- if cover.startswith('//'): # 补全URL中缺少的协议段
- cover = 'https:' + cover
- movie.cover = cover
- movie.publish_date = publish_date
- movie.duration = duration
- movie.producer = producer
- movie.genre = genre
- movie.actress = actress
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- base_url = permanent_url
- movie = MovieInfo('IPX-177')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- print(e)
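parse_data above distinguishes two redirect cases: a redirect that stays on the same host means the search matched exactly one film, while a redirect to a different host means the mirror domain itself changed and base_url has to be refreshed (hence the warning). The check reduces to comparing netlocs; a tiny sketch with made-up hostnames:

```python
from urllib.parse import urlsplit

def redirected_to_new_domain(request_url: str, final_url: str) -> bool:
    """True when a redirect landed on a different host, i.e. the configured
    mirror address is stale and should be updated."""
    return urlsplit(final_url).netloc != urlsplit(request_url).netloc

# Hypothetical hosts, for illustration only:
assert redirected_to_new_domain('https://old.example.com/cn/vl_searchbyid.php?keyword=IPX-177',
                                'https://new.example.com/') is True
assert redirected_to_new_domain('https://old.example.com/cn/vl_searchbyid.php?keyword=IPX-177',
                                'https://old.example.com/cn/IPX-177/') is False
```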
diff --git a/javsp/web/javmenu.py b/javsp/web/javmenu.py
deleted file mode 100644
index 5296a69cd..000000000
--- a/javsp/web/javmenu.py
+++ /dev/null
@@ -1,88 +0,0 @@
-"""从JavMenu抓取数据"""
-import logging
-
-from javsp.web.base import Request, resp2html
-from javsp.web.exceptions import *
-from javsp.datatype import MovieInfo
-
-
-request = Request()
-
-logger = logging.getLogger(__name__)
-base_url = 'https://mrzyx.xyz'
-
-
-def parse_data(movie: MovieInfo):
- """从网页抓取并解析指定番号的数据
- Args:
- movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
- """
- # JavMenu网页做得很不走心,将就了
- url = f'{base_url}/{movie.dvdid}'
- r = request.get(url)
- if r.history:
- # 被重定向到主页说明找不到影片资源
- raise MovieNotFoundError(__name__, movie.dvdid)
-
- html = resp2html(r)
- container = html.xpath("//div[@class='col-md-9 px-0']")[0]
- title = container.xpath("div[@class='col-12 mb-3']/h1/strong/text()")[0]
- # 竟然还在标题里插广告,真的疯了。要不是我已经写了抓取器,才懒得维护这个破站
- title = title.replace(' | JAV目錄大全 | 每日更新', '')
- title = title.replace(' 免費在線看', '').replace(' 免費AV在線看', '')
- cover_tag = container.xpath("//div[@class='single-video']")
- if len(cover_tag) > 0:
- video_tag = cover_tag[0].find('video')
- # URL首尾竟然也有空格……
- movie.cover = video_tag.get('data-poster').strip()
- # 预览影片改为blob了,无法获取
- # movie.preview_video = video_tag.find('source').get('src').strip()
- else:
- cover_img_tag = container.xpath("//img[@class='lazy rounded']/@data-src")
- if cover_img_tag:
- movie.cover = cover_img_tag[0].strip()
- info = container.xpath("//div[@class='card-body']")[0]
- publish_date = info.xpath("div/span[contains(text(), '日期:')]")[0].getnext().text
- duration = info.xpath("div/span[contains(text(), '時長:')]")[0].getnext().text.replace('分鐘', '')
- producer = info.xpath("div/span[contains(text(), '製作:')]/following-sibling::a/span/text()")
- if producer:
- movie.producer = producer[0]
- genre_tags = info.xpath("//a[@class='genre']")
- genre, genre_id = [], []
- for tag in genre_tags:
- items = tag.get('href').split('/')
- pre_id = items[-3] + '/' + items[-1]
- genre.append(tag.text.strip())
- genre_id.append(pre_id)
- # genre的链接中含有censored字段,但是无法用来判断影片是否有码,因为完全不可靠……
- actress = info.xpath("div/span[contains(text(), '女優:')]/following-sibling::*/a/text()") or None
- magnet_table = container.xpath("//table[contains(@class, 'magnet-table')]/tbody")
- if magnet_table:
- magnet_links = magnet_table[0].xpath("tr/td/a/@href")
- # 它的FC2数据是从JavDB抓的,JavDB更换图片服务器后它也跟上了,似乎数据更新频率还可以
- movie.magnet = [i.replace('[javdb.com]','') for i in magnet_links]
- preview_pics = container.xpath("//a[@data-fancybox='gallery']/@href")
-
- if (not movie.cover) and preview_pics:
- movie.cover = preview_pics[0]
- movie.url = url
- movie.title = title.replace(movie.dvdid, '').strip()
- movie.preview_pics = preview_pics
- movie.publish_date = publish_date
- movie.duration = duration
- movie.genre = genre
- movie.genre_id = genre_id
- movie.actress = actress
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('FC2-718323')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/mgstage.py b/javsp/web/mgstage.py
deleted file mode 100644
index 4904e51db..000000000
--- a/javsp/web/mgstage.py
+++ /dev/null
@@ -1,114 +0,0 @@
-"""从蚊香社-mgstage抓取数据"""
-import re
-import logging
-
-
-from javsp.web.base import Request, resp2html
-from javsp.web.exceptions import *
-from javsp.config import Cfg
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_url = 'https://www.mgstage.com'
-# 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面)
-request = Request()
-request.cookies = {'adc': '1'}
-
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- url = f'{base_url}/product/product_detail/{movie.dvdid}/'
- resp = request.get(url, delay_raise=True)
- if resp.status_code == 403:
- raise SiteBlocked('mgstage不允许从当前IP所在地区访问,请尝试更换为日本地区代理')
- # url不存在时会被重定向至主页。history非空时说明发生了重定向
- elif resp.history:
- raise MovieNotFoundError(__name__, movie.dvdid)
-
- html = resp2html(resp)
- # mgstage的文本中含有大量的空白字符('\n \t'),需要使用strip去除
- title = html.xpath("//div[@class='common_detail_cover']/h1/text()")[0].strip()
- container = html.xpath("//div[@class='detail_left']")[0]
- cover = container.xpath("//a[@id='EnlargeImage']/@href")[0]
- # 有链接的女优和仅有文本的女优匹配方法不同,因此分别匹配以后合并列表
- actress_text = container.xpath("//th[text()='出演:']/following-sibling::td/text()")
- actress_link = container.xpath("//th[text()='出演:']/following-sibling::td/a/text()")
- actress = [i.strip() for i in actress_text + actress_link]
- actress = [i for i in actress if i] # 移除空字符串
- producer = container.xpath("//th[text()='メーカー:']/following-sibling::td/a/text()")[0].strip()
- duration_str = container.xpath("//th[text()='収録時間:']/following-sibling::td/text()")[0]
- match = re.search(r'\d+', duration_str)
- if match:
- movie.duration = match.group(0)
- dvdid = container.xpath("//th[text()='品番:']/following-sibling::td/text()")[0]
- date_str = container.xpath("//th[text()='配信開始日:']/following-sibling::td/text()")[0]
- publish_date = date_str.replace('/', '-')
- serial_tag = container.xpath("//th[text()='シリーズ:']/following-sibling::td/a/text()")
- if serial_tag:
- movie.serial = serial_tag[0].strip()
- # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到
- # label = container.xpath("//th[text()='レーベル:']/following-sibling::td/text()")[0].strip()
- genre_tags = container.xpath("//th[text()='ジャンル:']/following-sibling::td/a")
- genre = [i.text.strip() for i in genre_tags]
- score_str = container.xpath("//td[@class='review']/span")[0].tail.strip()
- match = re.search(r'^[\.\d]+', score_str)
- if match:
- score = float(match.group()) * 2
- movie.score = f'{score:.2f}'
- # plot可能含有嵌套格式,为了保留plot中的换行关系,手动处理plot中的各个标签
- plots = []
- plot_p_tags = container.xpath("//dl[@id='introduction']/dd/p[not(@class='more')]")
- for p in plot_p_tags:
- children = p.getchildren()
- # 没有children时表明plot不含有格式,此时简单地提取文本就可以
- if not children:
- plots.append(p.text_content())
- continue
- for child in children:
- if child.tag == 'br' and plots[-1] != '\n':
- plots.append('\n')
- else:
- if child.text:
- plots.append(child.text)
- if child.tail:
- plots.append(child.tail)
- plot = ''.join(plots).strip()
- preview_pics = container.xpath("//a[@class='sample_image']/@href")
-
- if Cfg().crawler.hardworking:
- # 预览视频是点击按钮后再加载的,不在静态网页中
- btn_url = container.xpath("//a[@class='button_sample']/@href")[0]
- video_pid = btn_url.split('/')[-1]
- req_url = f'{base_url}/sampleplayer/sampleRespons.php?pid={video_pid}'
- resp = request.get(req_url).json()
- video_url = resp.get('url')
- if video_url:
- # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX
- preview_video = video_url.split('.ism/')[0] + '.mp4'
- movie.preview_video = preview_video
-
- movie.dvdid = dvdid
- movie.url = url
- movie.title = title
- movie.cover = cover
- movie.actress = actress
- movie.producer = producer
- movie.publish_date = publish_date
- movie.genre = genre
- movie.plot = plot
- movie.preview_pics = preview_pics
- movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('HRV-045')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
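In hardworking mode the sample-player endpoint returns a streaming URL, and the code above rewrites it into a directly downloadable .mp4 by cutting at '.ism/' (the expected URL shape is shown in the original comment). As a standalone helper, roughly:

```python
def sample_mp4_url(stream_url: str) -> str:
    """Rewrite mgstage's streaming sample URL into a direct .mp4 URL by keeping
    everything before '.ism/' and appending '.mp4'."""
    return stream_url.split('.ism/')[0] + '.mp4'

assert sample_mp4_url(
    '/sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX'
) == '/sample/shirouto/siro/3093/SIRO-3093_sample.mp4'
```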
diff --git a/javsp/web/njav.py b/javsp/web/njav.py
deleted file mode 100644
index f94e943f3..000000000
--- a/javsp/web/njav.py
+++ /dev/null
@@ -1,134 +0,0 @@
-"""从NJAV抓取数据"""
-import re
-import logging
-from typing import List
-
-
-from javsp.web.base import get_html
-from javsp.web.exceptions import *
-from javsp.lib import strftime_to_minutes
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_url = 'https://njav.tv/ja'
-
-def search_video(movie: MovieInfo):
- id_uc = movie.dvdid
- # 抓取网页
- url = f'{base_url}/search?keyword={id_uc}'
- html = get_html(url)
- list = html.xpath("//div[@class='box-item']/div[@class='detail']/a")
- video_url = None
- for item in list:
- search_title = item.xpath("text()")[0]
- if id_uc in search_title:
- video_url = item.xpath("@href")
- break
- if id_uc.startswith("FC2-"):
- fc2id = id_uc.replace('FC2-', '')
- if "FC2" in search_title and fc2id in search_title:
- video_url = item.xpath("@href")
- break
-
- return get_list_first(video_url)
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- # 抓取网页
- url = search_video(movie)
- if not url:
- raise MovieNotFoundError(__name__, movie.dvdid)
- html = get_html(url)
- container = html.xpath("//div[@class='container']/div/div[@class='col']")
- if len(container) > 0:
- container = container[0]
- else:
- raise MovieNotFoundError(__name__, movie.dvdid)
-
- title = container.xpath("//div[@class='d-flex justify-content-between align-items-start']/div/h1/text()")[0]
- thumb_pic = container.xpath("//div[@id='player']/@data-poster")
- plot = " ".join(container.xpath("//div[@class='description']/p/text()"))
- magnet = container.xpath("//div[@class='magnet']/a/@href")
- real_id = None
- publish_date = None
- duration_str = None
- uncensored = None
- preview_pics = None
- preview_video = None
- serial = None
- publisher = None
- producer = None
- genre = []
- actress = []
-
- detail_dic = {}
- for item in container.xpath("//div[@class='detail-item']/div"):
- item_title = item.xpath('span/text()')[0]
- if "タグ:" in item_title:
- genre += item.xpath("span")[1].xpath("a/text()")
- elif "ジャンル:" in item_title:
- genre += item.xpath("span")[1].xpath("a/text()")
- elif "レーベル:" in item_title:
- genre += item.xpath("span")[1].xpath("a/text()")
- elif "女優:" in item_title:
- actress = item.xpath("span")[1].xpath("a/text()")
- elif "シリーズ:" in item_title:
- serial = get_list_first(item.xpath("span")[1].xpath("a/text()"))
- elif "メーカー:" in item_title:
- producer = get_list_first(item.xpath("span")[1].xpath("a/text()"))
- elif "コード:" in item_title:
- real_id = get_list_first(item.xpath("span")[1].xpath("text()"))
- elif "公開日:" in item_title:
- publish_date = get_list_first(item.xpath("span")[1].xpath("text()"))
- elif "再生時間:" in item_title:
- duration_str = get_list_first(item.xpath("span")[1].xpath("text()"))
-
- # 清除标题里的番号字符
- keywords = [real_id, " "]
- if movie.dvdid.startswith("FC2"):
- keywords += ["FC2","PPV","-"] + [movie.dvdid.split("-")[-1]]
- for keyword in keywords:
- title = re.sub(re.escape(keyword), "", title, flags=re.I)
-
- # 判断是否无码
- uncensored_arr = magnet + [title]
- for uncensored_str in uncensored_arr:
- if 'uncensored' in uncensored_str.lower():
- uncensored = True
-
- movie.url = url
- movie.title = title
- movie.genre = genre
- movie.actress = actress
- movie.duration = str(strftime_to_minutes(duration_str))
- movie.publish_date = publish_date
- movie.publisher = publisher
- movie.producer = producer
- movie.uncensored = uncensored
- movie.preview_pics = preview_pics
- movie.preview_video = preview_video
- movie.plot = plot
- movie.serial = serial
- movie.magnet = magnet
-
- # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面
- if movie.preview_pics:
- movie.cover = preview_pics[0]
- else:
- movie.cover = get_list_first(thumb_pic)
-
-def get_list_first(list:List):
- return list[0] if list and len(list) > 0 else None
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('012023_002')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
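The title cleanup above strips id fragments with re.sub(re.escape(keyword), ...), but real_id stays None when the 'コード:' row is missing, and re.escape(None) would raise. A guarded variant that also collapses leftover whitespace (a sketch, not the deleted module's exact behaviour):

```python
import re
from typing import List, Optional

def strip_keywords(title: str, keywords: List[Optional[str]]) -> str:
    """Remove id fragments (e.g. 'FC2', 'PPV', '718323') from a scraped title,
    case-insensitively, skipping None/empty entries."""
    for kw in keywords:
        if kw:
            title = re.sub(re.escape(kw), '', title, flags=re.I)
    return ' '.join(title.split())

assert strip_keywords('FC2-PPV-718323 Some Title', ['FC2', 'PPV', '-', '718323']) == 'Some Title'
```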
diff --git a/javsp/web/prestige.py b/javsp/web/prestige.py
deleted file mode 100644
index f6884c658..000000000
--- a/javsp/web/prestige.py
+++ /dev/null
@@ -1,83 +0,0 @@
-"""从蚊香社-prestige抓取数据"""
-import re
-import logging
-
-
-from javsp.web.base import *
-from javsp.web.exceptions import *
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_url = 'https://www.prestige-av.com'
-# prestige要求访问者携带已通过R18认证的cookies才能够获得完整数据,否则会被重定向到认证页面
-# (其他多数网站的R18认证只是在网页上遮了一层,完整数据已经传回,不影响爬虫爬取)
-cookies = {'__age_auth__': 'true'}
-
-
-def parse_data(movie: MovieInfo):
- """从网页抓取并解析指定番号的数据
- Args:
- movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
- """
- url = f'{base_url}/goods/goods_detail.php?sku={movie.dvdid}'
- resp = request_get(url, cookies=cookies, delay_raise=True)
- if resp.status_code == 500:
- # 500错误表明prestige没有这部影片的数据,不是网络问题,因此不再重试
- raise MovieNotFoundError(__name__, movie.dvdid)
- elif resp.status_code == 403:
- raise SiteBlocked('prestige不允许从当前IP所在地区访问,请尝试更换为日本地区代理')
- resp.raise_for_status()
- html = resp2html(resp)
- container_tags = html.xpath("//section[@class='px-4 mb-4 md:px-8 md:mb-16']")
- if not container_tags:
- raise MovieNotFoundError(__name__, movie.dvdid)
-
- container = container_tags[0]
- title = container.xpath("h1/span")[0].tail.strip()
- cover = container.xpath("//div[@class='c-ratio-image mr-8']/picture/source/img/@src")[0]
- cover = cover.split('?')[0]
- actress = container.xpath("//p[text()='出演者:']/following-sibling::div/p/a/text()")
- # 移除女优名中的空格,使女优名与其他网站保持一致
- actress = [i.strip().replace(' ', '') for i in actress]
- duration_str = container.xpath("//p[text()='収録時間:']")[0].getnext().text_content()
- match = re.search(r'\d+', duration_str)
- if match:
- movie.duration = match.group(0)
- date_url = container.xpath("//p[text()='発売日:']/following-sibling::div/a/@href")[0]
- publish_date = date_url.split('?date=')[-1]
- producer = container.xpath("//p[text()='メーカー:']/following-sibling::div/a/text()")[0].strip()
- dvdid = container.xpath("//p[text()='品番:']/following-sibling::div/p/text()")[0]
- genre_tags = container.xpath("//p[text()='ジャンル:']/following-sibling::div/a")
- genre = [tag.text.strip() for tag in genre_tags]
- serial = container.xpath("//p[text()='レーベル:']/following-sibling::div/a/text()")[0].strip()
- plot = container.xpath("//h2[text()='商品紹介']/following-sibling::p")[0].text.strip()
- preview_pics = container.xpath("//h2[text()='サンプル画像']/following-sibling::div/div/picture/source/img/@src")
- preview_pics = [i.split('?')[0] for i in preview_pics]
-
- # prestige改版后已经无法获取高清封面,此前已经获取的高清封面地址也已失效
- movie.url = url
- movie.dvdid = dvdid
- movie.title = title
- movie.cover = cover
- movie.actress = actress
- movie.publish_date = publish_date
- movie.producer = producer
- movie.genre = genre
- movie.serial = serial
- movie.plot = plot
- movie.preview_pics = preview_pics
- movie.uncensored = False # prestige服务器在日本且面向日本国内公开发售,不会包含无码片
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('ABP-647')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/proxyfree.py b/javsp/web/proxyfree.py
deleted file mode 100644
index 89c1e63a4..000000000
--- a/javsp/web/proxyfree.py
+++ /dev/null
@@ -1,75 +0,0 @@
-"""获取各个网站的免代理地址"""
-import re
-import sys
-
-from javsp.web.base import is_connectable, get_html, get_resp_text, request_get
-
-
-def get_proxy_free_url(site_name: str, prefer_url=None) -> str:
- """获取指定网站的免代理地址
- Args:
- site_name (str): 站点名称
- prefer_url (str, optional): 优先测试此url是否可用
- Returns:
- str: 指定站点的免代理地址(失败时为空字符串)
- """
- if prefer_url and is_connectable(prefer_url, timeout=5):
- return prefer_url
- # 当prefer_url不可用时,尝试自动获取指定网站的免代理地址
- site_name = site_name.lower()
- func_name = f'_get_{site_name}_urls'
- get_funcs = [i for i in dir(sys.modules[__name__]) if i.startswith('_get_')]
- if func_name in get_funcs:
- get_urls = getattr(sys.modules[__name__], func_name)
- try:
- urls = get_urls()
- return _choose_one(urls)
- except:
- return ''
- else:
- raise Exception("Dont't know how to get proxy-free url for " + site_name)
-
-
-def _choose_one(urls) -> str:
- for url in urls:
- if is_connectable(url, timeout=5):
- return url
- return ''
-
-
-def _get_avsox_urls() -> list:
- html = get_html('https://tellme.pw/avsox')
- urls = html.xpath('//h4/strong/a/@href')
- return urls
-
-
-def _get_javbus_urls() -> list:
- html = get_html('https://www.javbus.one/')
- text = html.text_content()
- urls = re.findall(r'防屏蔽地址:(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})', text, re.I | re.A)
- return urls
-
-
-def _get_javlib_urls() -> list:
- html = get_html('https://github.com/javlibcom')
- text = html.xpath("//div[@class='p-note user-profile-bio mb-3 js-user-profile-bio f4']")[0].text_content()
- match = re.search(r'[\w\.]+', text, re.A)
- if match:
- domain = f'https://www.{match.group(0)}.com'
- return [domain]
-
-
-def _get_javdb_urls() -> list:
- html = get_html('https://jav524.app')
- js_links = html.xpath("//script[@src]/@src")
- for link in js_links:
- if '/js/index' in link:
- text = get_resp_text(request_get(link))
- match = re.search(r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"', text, flags=re.I | re.A)
- if match:
- return [match.group(1)]
-
-
-if __name__ == "__main__":
- print('javdb:\t', _get_javdb_urls())
- print('javlib:\t', _get_javlib_urls())
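_choose_one() above returns the first candidate address that answers within 5 seconds; with the switch to aiohttp elsewhere in this PR the same probe can be expressed asynchronously. A possible async shape (the names here are illustrative, not the project's new API):

```python
import asyncio
from typing import List, Optional

import aiohttp

async def choose_first_connectable(urls: List[str], timeout_s: float = 5.0) -> Optional[str]:
    """Return the first URL in the list that answers an HTTP GET, or None if none do."""
    timeout = aiohttp.ClientTimeout(total=timeout_s)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        for url in urls:
            try:
                async with session.get(url) as resp:
                    if resp.status < 400:
                        return url
            except (aiohttp.ClientError, asyncio.TimeoutError):
                continue
    return None

# asyncio.run(choose_first_connectable(['https://tellme.pw/avsox', 'https://www.javbus.one/']))
```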
diff --git a/poetry.lock b/poetry.lock
index 1c92293a3..5d679f751 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,5 +1,193 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+[[package]]
+name = "aiofiles"
+version = "24.1.0"
+description = "File support for asyncio."
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "aiofiles-24.1.0-py3-none-any.whl", hash = "sha256:b4ec55f4195e3eb5d7abd1bf7e061763e864dd4954231fb8539a0ef8bb8260e5"},
+ {file = "aiofiles-24.1.0.tar.gz", hash = "sha256:22a075c9e5a3810f0c2e48f3008c94d68c65d763b9b03857924c99e57355166c"},
+]
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
+[[package]]
+name = "aiohappyeyeballs"
+version = "2.4.2"
+description = "Happy Eyeballs for asyncio"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "aiohappyeyeballs-2.4.2-py3-none-any.whl", hash = "sha256:8522691d9a154ba1145b157d6d5c15e5c692527ce6a53c5e5f9876977f6dab2f"},
+ {file = "aiohappyeyeballs-2.4.2.tar.gz", hash = "sha256:4ca893e6c5c1f5bf3888b04cb5a3bee24995398efef6e0b9f747b5e89d84fd74"},
+]
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
+[[package]]
+name = "aiohttp"
+version = "3.10.8"
+description = "Async http client/server framework (asyncio)"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "aiohttp-3.10.8-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a1ba7bc139592339ddeb62c06486d0fa0f4ca61216e14137a40d626c81faf10c"},
+ {file = "aiohttp-3.10.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:85e4d7bd05d18e4b348441e7584c681eff646e3bf38f68b2626807f3add21aa2"},
+ {file = "aiohttp-3.10.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:69de056022e7abf69cb9fec795515973cc3eeaff51e3ea8d72a77aa933a91c52"},
+ {file = "aiohttp-3.10.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee3587506898d4a404b33bd19689286ccf226c3d44d7a73670c8498cd688e42c"},
+ {file = "aiohttp-3.10.8-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fe285a697c851734285369614443451462ce78aac2b77db23567507484b1dc6f"},
+ {file = "aiohttp-3.10.8-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:10c7932337285a6bfa3a5fe1fd4da90b66ebfd9d0cbd1544402e1202eb9a8c3e"},
+ {file = "aiohttp-3.10.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd9716ef0224fe0d0336997eb242f40619f9f8c5c57e66b525a1ebf9f1d8cebe"},
+ {file = "aiohttp-3.10.8-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ceacea31f8a55cdba02bc72c93eb2e1b77160e91f8abd605969c168502fd71eb"},
+ {file = "aiohttp-3.10.8-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9721554bfa9e15f6e462da304374c2f1baede3cb06008c36c47fa37ea32f1dc4"},
+ {file = "aiohttp-3.10.8-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:22cdeb684d8552490dd2697a5138c4ecb46f844892df437aaf94f7eea99af879"},
+ {file = "aiohttp-3.10.8-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e56bb7e31c4bc79956b866163170bc89fd619e0581ce813330d4ea46921a4881"},
+ {file = "aiohttp-3.10.8-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:3a95d2686bc4794d66bd8de654e41b5339fab542b2bca9238aa63ed5f4f2ce82"},
+ {file = "aiohttp-3.10.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d82404a0e7b10e0d7f022cf44031b78af8a4f99bd01561ac68f7c24772fed021"},
+ {file = "aiohttp-3.10.8-cp310-cp310-win32.whl", hash = "sha256:4e10b04542d27e21538e670156e88766543692a0a883f243ba8fad9ddea82e53"},
+ {file = "aiohttp-3.10.8-cp310-cp310-win_amd64.whl", hash = "sha256:680dbcff5adc7f696ccf8bf671d38366a1f620b5616a1d333d0cb33956065395"},
+ {file = "aiohttp-3.10.8-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:33a68011a38020ed4ff41ae0dbf4a96a202562ecf2024bdd8f65385f1d07f6ef"},
+ {file = "aiohttp-3.10.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6c7efa6616a95e3bd73b8a69691012d2ef1f95f9ea0189e42f338fae080c2fc6"},
+ {file = "aiohttp-3.10.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ddb9b9764cfb4459acf01c02d2a59d3e5066b06a846a364fd1749aa168efa2be"},
+ {file = "aiohttp-3.10.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c7f270f4ca92760f98a42c45a58674fff488e23b144ec80b1cc6fa2effed377"},
+ {file = "aiohttp-3.10.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6984dda9d79064361ab58d03f6c1e793ea845c6cfa89ffe1a7b9bb400dfd56bd"},
+ {file = "aiohttp-3.10.8-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3f6d47e392c27206701565c8df4cac6ebed28fdf6dcaea5b1eea7a4631d8e6db"},
+ {file = "aiohttp-3.10.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a72f89aea712c619b2ca32c6f4335c77125ede27530ad9705f4f349357833695"},
+ {file = "aiohttp-3.10.8-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c36074b26f3263879ba8e4dbd33db2b79874a3392f403a70b772701363148b9f"},
+ {file = "aiohttp-3.10.8-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e32148b4a745e70a255a1d44b5664de1f2e24fcefb98a75b60c83b9e260ddb5b"},
+ {file = "aiohttp-3.10.8-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5aa1a073514cf59c81ad49a4ed9b5d72b2433638cd53160fd2f3a9cfa94718db"},
+ {file = "aiohttp-3.10.8-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:d3a79200a9d5e621c4623081ddb25380b713c8cf5233cd11c1aabad990bb9381"},
+ {file = "aiohttp-3.10.8-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e45fdfcb2d5bcad83373e4808825b7512953146d147488114575780640665027"},
+ {file = "aiohttp-3.10.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f78e2a78432c537ae876a93013b7bc0027ba5b93ad7b3463624c4b6906489332"},
+ {file = "aiohttp-3.10.8-cp311-cp311-win32.whl", hash = "sha256:f8179855a4e4f3b931cb1764ec87673d3fbdcca2af496c8d30567d7b034a13db"},
+ {file = "aiohttp-3.10.8-cp311-cp311-win_amd64.whl", hash = "sha256:ef9b484604af05ca745b6108ca1aaa22ae1919037ae4f93aaf9a37ba42e0b835"},
+ {file = "aiohttp-3.10.8-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:ab2d6523575fc98896c80f49ac99e849c0b0e69cc80bf864eed6af2ae728a52b"},
+ {file = "aiohttp-3.10.8-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f5d5d5401744dda50b943d8764508d0e60cc2d3305ac1e6420935861a9d544bc"},
+ {file = "aiohttp-3.10.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de23085cf90911600ace512e909114385026b16324fa203cc74c81f21fd3276a"},
+ {file = "aiohttp-3.10.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4618f0d2bf523043866a9ff8458900d8eb0a6d4018f251dae98e5f1fb699f3a8"},
+ {file = "aiohttp-3.10.8-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:21c1925541ca84f7b5e0df361c0a813a7d6a56d3b0030ebd4b220b8d232015f9"},
+ {file = "aiohttp-3.10.8-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:497a7d20caea8855c5429db3cdb829385467217d7feb86952a6107e033e031b9"},
+ {file = "aiohttp-3.10.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c887019dbcb4af58a091a45ccf376fffe800b5531b45c1efccda4bedf87747ea"},
+ {file = "aiohttp-3.10.8-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40d2d719c3c36a7a65ed26400e2b45b2d9ed7edf498f4df38b2ae130f25a0d01"},
+ {file = "aiohttp-3.10.8-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:57359785f27394a8bcab0da6dcd46706d087dfebf59a8d0ad2e64a4bc2f6f94f"},
+ {file = "aiohttp-3.10.8-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a961ee6f2cdd1a2be4735333ab284691180d40bad48f97bb598841bfcbfb94ec"},
+ {file = "aiohttp-3.10.8-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:fe3d79d6af839ffa46fdc5d2cf34295390894471e9875050eafa584cb781508d"},
+ {file = "aiohttp-3.10.8-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9a281cba03bdaa341c70b7551b2256a88d45eead149f48b75a96d41128c240b3"},
+ {file = "aiohttp-3.10.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c6769d71bfb1ed60321363a9bc05e94dcf05e38295ef41d46ac08919e5b00d19"},
+ {file = "aiohttp-3.10.8-cp312-cp312-win32.whl", hash = "sha256:a3081246bab4d419697ee45e555cef5cd1def7ac193dff6f50be761d2e44f194"},
+ {file = "aiohttp-3.10.8-cp312-cp312-win_amd64.whl", hash = "sha256:ab1546fc8e00676febc81c548a876c7bde32f881b8334b77f84719ab2c7d28dc"},
+ {file = "aiohttp-3.10.8-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:b1a012677b8e0a39e181e218de47d6741c5922202e3b0b65e412e2ce47c39337"},
+ {file = "aiohttp-3.10.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2df786c96c57cd6b87156ba4c5f166af7b88f3fc05f9d592252fdc83d8615a3c"},
+ {file = "aiohttp-3.10.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8885ca09d3a9317219c0831276bfe26984b17b2c37b7bf70dd478d17092a4772"},
+ {file = "aiohttp-3.10.8-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4dbf252ac19860e0ab56cd480d2805498f47c5a2d04f5995d8d8a6effd04b48c"},
+ {file = "aiohttp-3.10.8-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b2036479b6b94afaaca7d07b8a68dc0e67b0caf5f6293bb6a5a1825f5923000"},
+ {file = "aiohttp-3.10.8-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:365783e1b7c40b59ed4ce2b5a7491bae48f41cd2c30d52647a5b1ee8604c68ad"},
+ {file = "aiohttp-3.10.8-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:270e653b5a4b557476a1ed40e6b6ce82f331aab669620d7c95c658ef976c9c5e"},
+ {file = "aiohttp-3.10.8-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8960fabc20bfe4fafb941067cda8e23c8c17c98c121aa31c7bf0cdab11b07842"},
+ {file = "aiohttp-3.10.8-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f21e8f2abed9a44afc3d15bba22e0dfc71e5fa859bea916e42354c16102b036f"},
+ {file = "aiohttp-3.10.8-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:fecd55e7418fabd297fd836e65cbd6371aa4035a264998a091bbf13f94d9c44d"},
+ {file = "aiohttp-3.10.8-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:badb51d851358cd7535b647bb67af4854b64f3c85f0d089c737f75504d5910ec"},
+ {file = "aiohttp-3.10.8-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e860985f30f3a015979e63e7ba1a391526cdac1b22b7b332579df7867848e255"},
+ {file = "aiohttp-3.10.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:71462f8eeca477cbc0c9700a9464e3f75f59068aed5e9d4a521a103692da72dc"},
+ {file = "aiohttp-3.10.8-cp313-cp313-win32.whl", hash = "sha256:177126e971782769b34933e94fddd1089cef0fe6b82fee8a885e539f5b0f0c6a"},
+ {file = "aiohttp-3.10.8-cp313-cp313-win_amd64.whl", hash = "sha256:98a4eb60e27033dee9593814ca320ee8c199489fbc6b2699d0f710584db7feb7"},
+ {file = "aiohttp-3.10.8-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ffef3d763e4c8fc97e740da5b4d0f080b78630a3914f4e772a122bbfa608c1db"},
+ {file = "aiohttp-3.10.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:597128cb7bc5f068181b49a732961f46cb89f85686206289d6ccb5e27cb5fbe2"},
+ {file = "aiohttp-3.10.8-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f23a6c1d09de5de89a33c9e9b229106cb70dcfdd55e81a3a3580eaadaa32bc92"},
+ {file = "aiohttp-3.10.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da57af0c54a302b7c655fa1ccd5b1817a53739afa39924ef1816e7b7c8a07ccb"},
+ {file = "aiohttp-3.10.8-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e7a6af57091056a79a35104d6ec29d98ec7f1fb7270ad9c6fff871b678d1ff8"},
+ {file = "aiohttp-3.10.8-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:32710d6b3b6c09c60c794d84ca887a3a2890131c0b02b3cefdcc6709a2260a7c"},
+ {file = "aiohttp-3.10.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b91f4f62ad39a8a42d511d66269b46cb2fb7dea9564c21ab6c56a642d28bff5"},
+ {file = "aiohttp-3.10.8-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:471a8c47344b9cc309558b3fcc469bd2c12b49322b4b31eb386c4a2b2d44e44a"},
+ {file = "aiohttp-3.10.8-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:fc0e7f91705445d79beafba9bb3057dd50830e40fe5417017a76a214af54e122"},
+ {file = "aiohttp-3.10.8-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:85431c9131a9a0f65260dc7a65c800ca5eae78c4c9931618f18c8e0933a0e0c1"},
+ {file = "aiohttp-3.10.8-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:b91557ee0893da52794b25660d4f57bb519bcad8b7df301acd3898f7197c5d81"},
+ {file = "aiohttp-3.10.8-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:4954e6b06dd0be97e1a5751fc606be1f9edbdc553c5d9b57d72406a8fbd17f9d"},
+ {file = "aiohttp-3.10.8-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:a087c84b4992160ffef7afd98ef24177c8bd4ad61c53607145a8377457385100"},
+ {file = "aiohttp-3.10.8-cp38-cp38-win32.whl", hash = "sha256:e1f0f7b27171b2956a27bd8f899751d0866ddabdd05cbddf3520f945130a908c"},
+ {file = "aiohttp-3.10.8-cp38-cp38-win_amd64.whl", hash = "sha256:c4916070e12ae140110aa598031876c1bf8676a36a750716ea0aa5bd694aa2e7"},
+ {file = "aiohttp-3.10.8-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5284997e3d88d0dfb874c43e51ae8f4a6f4ca5b90dcf22995035187253d430db"},
+ {file = "aiohttp-3.10.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9443d9ebc5167ce1fbb552faf2d666fb22ef5716a8750be67efd140a7733738c"},
+ {file = "aiohttp-3.10.8-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b667e2a03407d79a76c618dc30cedebd48f082d85880d0c9c4ec2faa3e10f43e"},
+ {file = "aiohttp-3.10.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98fae99d5c2146f254b7806001498e6f9ffb0e330de55a35e72feb7cb2fa399b"},
+ {file = "aiohttp-3.10.8-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8296edd99d0dd9d0eb8b9e25b3b3506eef55c1854e9cc230f0b3f885f680410b"},
+ {file = "aiohttp-3.10.8-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1ce46dfb49cfbf9e92818be4b761d4042230b1f0e05ffec0aad15b3eb162b905"},
+ {file = "aiohttp-3.10.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c38cfd355fd86c39b2d54651bd6ed7d63d4fe3b5553f364bae3306e2445f847"},
+ {file = "aiohttp-3.10.8-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:713dff3f87ceec3bde4f3f484861464e722cf7533f9fa6b824ec82bb5a9010a7"},
+ {file = "aiohttp-3.10.8-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:21a72f4a9c69a8567a0aca12042f12bba25d3139fd5dd8eeb9931f4d9e8599cd"},
+ {file = "aiohttp-3.10.8-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6d1ad868624f6cea77341ef2877ad4e71f7116834a6cd7ec36ec5c32f94ee6ae"},
+ {file = "aiohttp-3.10.8-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:a78ba86d5a08207d1d1ad10b97aed6ea48b374b3f6831d02d0b06545ac0f181e"},
+ {file = "aiohttp-3.10.8-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:aff048793d05e1ce05b62e49dccf81fe52719a13f4861530706619506224992b"},
+ {file = "aiohttp-3.10.8-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d088ca05381fd409793571d8e34eca06daf41c8c50a05aeed358d2d340c7af81"},
+ {file = "aiohttp-3.10.8-cp39-cp39-win32.whl", hash = "sha256:ee97c4e54f457c366e1f76fbbf3e8effee9de57dae671084a161c00f481106ce"},
+ {file = "aiohttp-3.10.8-cp39-cp39-win_amd64.whl", hash = "sha256:d95ae4420669c871667aad92ba8cce6251d61d79c1a38504621094143f94a8b4"},
+ {file = "aiohttp-3.10.8.tar.gz", hash = "sha256:21f8225f7dc187018e8433c9326be01477fb2810721e048b33ac49091b19fb4a"},
+]
+
+[package.dependencies]
+aiohappyeyeballs = ">=2.3.0"
+aiosignal = ">=1.1.2"
+async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""}
+attrs = ">=17.3.0"
+frozenlist = ">=1.1.1"
+multidict = ">=4.5,<7.0"
+yarl = ">=1.12.0,<2.0"
+
+[package.extras]
+speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"]
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
+[[package]]
+name = "aiohttp-socks"
+version = "0.9.0"
+description = "Proxy connector for aiohttp"
+optional = false
+python-versions = "*"
+files = [
+ {file = "aiohttp_socks-0.9.0-py3-none-any.whl", hash = "sha256:90a8211fd5b904ccbd010900105f1fd2dab20ae8a07df508df399036ad8d3d88"},
+ {file = "aiohttp_socks-0.9.0.tar.gz", hash = "sha256:22159a1af026b229cfe5ea007e065bb3fe56385a951a82623a6f4588a6758003"},
+]
+
+[package.dependencies]
+aiohttp = ">=3.10.0"
+python-socks = {version = ">=2.4.3,<3.0.0", extras = ["asyncio"]}
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
+[[package]]
+name = "aiosignal"
+version = "1.3.1"
+description = "aiosignal: a list of registered asynchronous callbacks"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"},
+ {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"},
+]
+
+[package.dependencies]
+frozenlist = ">=1.1.0"
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
[[package]]
name = "annotated-types"
version = "0.7.0"
@@ -17,14 +205,14 @@ url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
[[package]]
-name = "certifi"
-version = "2024.8.30"
-description = "Python package for providing Mozilla's CA Bundle."
+name = "async-timeout"
+version = "4.0.3"
+description = "Timeout context manager for asyncio programs"
optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
files = [
- {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"},
- {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"},
+ {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"},
+ {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"},
]
[package.source]
@@ -32,6 +220,30 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
+[[package]]
+name = "attrs"
+version = "24.2.0"
+description = "Classes Without Boilerplate"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"},
+ {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"},
+]
+
+[package.extras]
+benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
+cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
+dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
+docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"]
+tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
+tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"]
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
[[package]]
name = "cffi"
version = "1.17.1"
@@ -116,131 +328,6 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
-[[package]]
-name = "charset-normalizer"
-version = "3.3.2"
-description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
-optional = false
-python-versions = ">=3.7.0"
-files = [
- {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"},
- {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"},
-]
-
-[package.source]
-type = "legacy"
-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
-reference = "mirrors"
-
-[[package]]
-name = "cloudscraper"
-version = "1.2.71"
-description = "A Python module to bypass Cloudflare's anti-bot page."
-optional = false
-python-versions = "*"
-files = [
- {file = "cloudscraper-1.2.71-py2.py3-none-any.whl", hash = "sha256:76f50ca529ed2279e220837befdec892626f9511708e200d48d5bb76ded679b0"},
- {file = "cloudscraper-1.2.71.tar.gz", hash = "sha256:429c6e8aa6916d5bad5c8a5eac50f3ea53c9ac22616f6cb21b18dcc71517d0d3"},
-]
-
-[package.dependencies]
-pyparsing = ">=2.4.7"
-requests = ">=2.9.2"
-requests-toolbelt = ">=0.9.1"
-
-[package.source]
-type = "legacy"
-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
-reference = "mirrors"
-
[[package]]
name = "colorama"
version = "0.4.4"
@@ -570,6 +657,97 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
+[[package]]
+name = "frozenlist"
+version = "1.4.1"
+description = "A list-like structure which implements collections.abc.MutableSequence"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f9aa1878d1083b276b0196f2dfbe00c9b7e752475ed3b682025ff20c1c1f51ac"},
+ {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:29acab3f66f0f24674b7dc4736477bcd4bc3ad4b896f5f45379a67bce8b96868"},
+ {file = "frozenlist-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:74fb4bee6880b529a0c6560885fce4dc95936920f9f20f53d99a213f7bf66776"},
+ {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:590344787a90ae57d62511dd7c736ed56b428f04cd8c161fcc5e7232c130c69a"},
+ {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:068b63f23b17df8569b7fdca5517edef76171cf3897eb68beb01341131fbd2ad"},
+ {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c849d495bf5154cd8da18a9eb15db127d4dba2968d88831aff6f0331ea9bd4c"},
+ {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9750cc7fe1ae3b1611bb8cfc3f9ec11d532244235d75901fb6b8e42ce9229dfe"},
+ {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a"},
+ {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0633c8d5337cb5c77acbccc6357ac49a1770b8c487e5b3505c57b949b4b82e98"},
+ {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:27657df69e8801be6c3638054e202a135c7f299267f1a55ed3a598934f6c0d75"},
+ {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:f9a3ea26252bd92f570600098783d1371354d89d5f6b7dfd87359d669f2109b5"},
+ {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:4f57dab5fe3407b6c0c1cc907ac98e8a189f9e418f3b6e54d65a718aaafe3950"},
+ {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e02a0e11cf6597299b9f3bbd3f93d79217cb90cfd1411aec33848b13f5c656cc"},
+ {file = "frozenlist-1.4.1-cp310-cp310-win32.whl", hash = "sha256:a828c57f00f729620a442881cc60e57cfcec6842ba38e1b19fd3e47ac0ff8dc1"},
+ {file = "frozenlist-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:f56e2333dda1fe0f909e7cc59f021eba0d2307bc6f012a1ccf2beca6ba362439"},
+ {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a0cb6f11204443f27a1628b0e460f37fb30f624be6051d490fa7d7e26d4af3d0"},
+ {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b46c8ae3a8f1f41a0d2ef350c0b6e65822d80772fe46b653ab6b6274f61d4a49"},
+ {file = "frozenlist-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced"},
+ {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:722e1124aec435320ae01ee3ac7bec11a5d47f25d0ed6328f2273d287bc3abb0"},
+ {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2471c201b70d58a0f0c1f91261542a03d9a5e088ed3dc6c160d614c01649c106"},
+ {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c757a9dd70d72b076d6f68efdbb9bc943665ae954dad2801b874c8c69e185068"},
+ {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f146e0911cb2f1da549fc58fc7bcd2b836a44b79ef871980d605ec392ff6b0d2"},
+ {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9c515e7914626b2a2e1e311794b4c35720a0be87af52b79ff8e1429fc25f19"},
+ {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c302220494f5c1ebeb0912ea782bcd5e2f8308037b3c7553fad0e48ebad6ad82"},
+ {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:442acde1e068288a4ba7acfe05f5f343e19fac87bfc96d89eb886b0363e977ec"},
+ {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:1b280e6507ea8a4fa0c0a7150b4e526a8d113989e28eaaef946cc77ffd7efc0a"},
+ {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74"},
+ {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:db9e724bebd621d9beca794f2a4ff1d26eed5965b004a97f1f1685a173b869c2"},
+ {file = "frozenlist-1.4.1-cp311-cp311-win32.whl", hash = "sha256:e774d53b1a477a67838a904131c4b0eef6b3d8a651f8b138b04f748fccfefe17"},
+ {file = "frozenlist-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:fb3c2db03683b5767dedb5769b8a40ebb47d6f7f45b1b3e3b4b51ec8ad9d9825"},
+ {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:1979bc0aeb89b33b588c51c54ab0161791149f2461ea7c7c946d95d5f93b56ae"},
+ {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cc7b01b3754ea68a62bd77ce6020afaffb44a590c2289089289363472d13aedb"},
+ {file = "frozenlist-1.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c9c92be9fd329ac801cc420e08452b70e7aeab94ea4233a4804f0915c14eba9b"},
+ {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c3894db91f5a489fc8fa6a9991820f368f0b3cbdb9cd8849547ccfab3392d86"},
+ {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba60bb19387e13597fb059f32cd4d59445d7b18b69a745b8f8e5db0346f33480"},
+ {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8aefbba5f69d42246543407ed2461db31006b0f76c4e32dfd6f42215a2c41d09"},
+ {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:780d3a35680ced9ce682fbcf4cb9c2bad3136eeff760ab33707b71db84664e3a"},
+ {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9acbb16f06fe7f52f441bb6f413ebae6c37baa6ef9edd49cdd567216da8600cd"},
+ {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:23b701e65c7b36e4bf15546a89279bd4d8675faabc287d06bbcfac7d3c33e1e6"},
+ {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:3e0153a805a98f5ada7e09826255ba99fb4f7524bb81bf6b47fb702666484ae1"},
+ {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:dd9b1baec094d91bf36ec729445f7769d0d0cf6b64d04d86e45baf89e2b9059b"},
+ {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:1a4471094e146b6790f61b98616ab8e44f72661879cc63fa1049d13ef711e71e"},
+ {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5667ed53d68d91920defdf4035d1cdaa3c3121dc0b113255124bcfada1cfa1b8"},
+ {file = "frozenlist-1.4.1-cp312-cp312-win32.whl", hash = "sha256:beee944ae828747fd7cb216a70f120767fc9f4f00bacae8543c14a6831673f89"},
+ {file = "frozenlist-1.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:64536573d0a2cb6e625cf309984e2d873979709f2cf22839bf2d61790b448ad5"},
+ {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:20b51fa3f588ff2fe658663db52a41a4f7aa6c04f6201449c6c7c476bd255c0d"},
+ {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:410478a0c562d1a5bcc2f7ea448359fcb050ed48b3c6f6f4f18c313a9bdb1826"},
+ {file = "frozenlist-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c6321c9efe29975232da3bd0af0ad216800a47e93d763ce64f291917a381b8eb"},
+ {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48f6a4533887e189dae092f1cf981f2e3885175f7a0f33c91fb5b7b682b6bab6"},
+ {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6eb73fa5426ea69ee0e012fb59cdc76a15b1283d6e32e4f8dc4482ec67d1194d"},
+ {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbeb989b5cc29e8daf7f976b421c220f1b8c731cbf22b9130d8815418ea45887"},
+ {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:32453c1de775c889eb4e22f1197fe3bdfe457d16476ea407472b9442e6295f7a"},
+ {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:693945278a31f2086d9bf3df0fe8254bbeaef1fe71e1351c3bd730aa7d31c41b"},
+ {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1d0ce09d36d53bbbe566fe296965b23b961764c0bcf3ce2fa45f463745c04701"},
+ {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:3a670dc61eb0d0eb7080890c13de3066790f9049b47b0de04007090807c776b0"},
+ {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:dca69045298ce5c11fd539682cff879cc1e664c245d1c64da929813e54241d11"},
+ {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a06339f38e9ed3a64e4c4e43aec7f59084033647f908e4259d279a52d3757d09"},
+ {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b7f2f9f912dca3934c1baec2e4585a674ef16fe00218d833856408c48d5beee7"},
+ {file = "frozenlist-1.4.1-cp38-cp38-win32.whl", hash = "sha256:e7004be74cbb7d9f34553a5ce5fb08be14fb33bc86f332fb71cbe5216362a497"},
+ {file = "frozenlist-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:5a7d70357e7cee13f470c7883a063aae5fe209a493c57d86eb7f5a6f910fae09"},
+ {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bfa4a17e17ce9abf47a74ae02f32d014c5e9404b6d9ac7f729e01562bbee601e"},
+ {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b7e3ed87d4138356775346e6845cccbe66cd9e207f3cd11d2f0b9fd13681359d"},
+ {file = "frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c99169d4ff810155ca50b4da3b075cbde79752443117d89429595c2e8e37fed8"},
+ {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edb678da49d9f72c9f6c609fbe41a5dfb9a9282f9e6a2253d5a91e0fc382d7c0"},
+ {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6db4667b187a6742b33afbbaf05a7bc551ffcf1ced0000a571aedbb4aa42fc7b"},
+ {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55fdc093b5a3cb41d420884cdaf37a1e74c3c37a31f46e66286d9145d2063bd0"},
+ {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82e8211d69a4f4bc360ea22cd6555f8e61a1bd211d1d5d39d3d228b48c83a897"},
+ {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89aa2c2eeb20957be2d950b85974b30a01a762f3308cd02bb15e1ad632e22dc7"},
+ {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9d3e0c25a2350080e9319724dede4f31f43a6c9779be48021a7f4ebde8b2d742"},
+ {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7268252af60904bf52c26173cbadc3a071cece75f873705419c8681f24d3edea"},
+ {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0c250a29735d4f15321007fb02865f0e6b6a41a6b88f1f523ca1596ab5f50bd5"},
+ {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:96ec70beabbd3b10e8bfe52616a13561e58fe84c0101dd031dc78f250d5128b9"},
+ {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:23b2d7679b73fe0e5a4560b672a39f98dfc6f60df63823b0a9970525325b95f6"},
+ {file = "frozenlist-1.4.1-cp39-cp39-win32.whl", hash = "sha256:a7496bfe1da7fb1a4e1cc23bb67c58fab69311cc7d32b5a99c2007b4b2a0e932"},
+ {file = "frozenlist-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:e6a20a581f9ce92d389a8c7d7c3dd47c81fd5d6e655c8dddf341e14aa48659d0"},
+ {file = "frozenlist-1.4.1-py3-none-any.whl", hash = "sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7"},
+ {file = "frozenlist-1.4.1.tar.gz", hash = "sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b"},
+]
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
[[package]]
name = "idna"
version = "3.10"
@@ -905,6 +1083,115 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
+[[package]]
+name = "multidict"
+version = "6.1.0"
+description = "multidict implementation"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3380252550e372e8511d49481bd836264c009adb826b23fefcc5dd3c69692f60"},
+ {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:99f826cbf970077383d7de805c0681799491cb939c25450b9b5b3ced03ca99f1"},
+ {file = "multidict-6.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a114d03b938376557927ab23f1e950827c3b893ccb94b62fd95d430fd0e5cf53"},
+ {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1c416351ee6271b2f49b56ad7f308072f6f44b37118d69c2cad94f3fa8a40d5"},
+ {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b5d83030255983181005e6cfbac1617ce9746b219bc2aad52201ad121226581"},
+ {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3e97b5e938051226dc025ec80980c285b053ffb1e25a3db2a3aa3bc046bf7f56"},
+ {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d618649d4e70ac6efcbba75be98b26ef5078faad23592f9b51ca492953012429"},
+ {file = "multidict-6.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10524ebd769727ac77ef2278390fb0068d83f3acb7773792a5080f2b0abf7748"},
+ {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ff3827aef427c89a25cc96ded1759271a93603aba9fb977a6d264648ebf989db"},
+ {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:06809f4f0f7ab7ea2cabf9caca7d79c22c0758b58a71f9d32943ae13c7ace056"},
+ {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:f179dee3b863ab1c59580ff60f9d99f632f34ccb38bf67a33ec6b3ecadd0fd76"},
+ {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:aaed8b0562be4a0876ee3b6946f6869b7bcdb571a5d1496683505944e268b160"},
+ {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3c8b88a2ccf5493b6c8da9076fb151ba106960a2df90c2633f342f120751a9e7"},
+ {file = "multidict-6.1.0-cp310-cp310-win32.whl", hash = "sha256:4a9cb68166a34117d6646c0023c7b759bf197bee5ad4272f420a0141d7eb03a0"},
+ {file = "multidict-6.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:20b9b5fbe0b88d0bdef2012ef7dee867f874b72528cf1d08f1d59b0e3850129d"},
+ {file = "multidict-6.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3efe2c2cb5763f2f1b275ad2bf7a287d3f7ebbef35648a9726e3b69284a4f3d6"},
+ {file = "multidict-6.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c7053d3b0353a8b9de430a4f4b4268ac9a4fb3481af37dfe49825bf45ca24156"},
+ {file = "multidict-6.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:27e5fc84ccef8dfaabb09d82b7d179c7cf1a3fbc8a966f8274fcb4ab2eb4cadb"},
+ {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e2b90b43e696f25c62656389d32236e049568b39320e2735d51f08fd362761b"},
+ {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d83a047959d38a7ff552ff94be767b7fd79b831ad1cd9920662db05fec24fe72"},
+ {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1a9dd711d0877a1ece3d2e4fea11a8e75741ca21954c919406b44e7cf971304"},
+ {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec2abea24d98246b94913b76a125e855eb5c434f7c46546046372fe60f666351"},
+ {file = "multidict-6.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4867cafcbc6585e4b678876c489b9273b13e9fff9f6d6d66add5e15d11d926cb"},
+ {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5b48204e8d955c47c55b72779802b219a39acc3ee3d0116d5080c388970b76e3"},
+ {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:d8fff389528cad1618fb4b26b95550327495462cd745d879a8c7c2115248e399"},
+ {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a7a9541cd308eed5e30318430a9c74d2132e9a8cb46b901326272d780bf2d423"},
+ {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:da1758c76f50c39a2efd5e9859ce7d776317eb1dd34317c8152ac9251fc574a3"},
+ {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c943a53e9186688b45b323602298ab727d8865d8c9ee0b17f8d62d14b56f0753"},
+ {file = "multidict-6.1.0-cp311-cp311-win32.whl", hash = "sha256:90f8717cb649eea3504091e640a1b8568faad18bd4b9fcd692853a04475a4b80"},
+ {file = "multidict-6.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:82176036e65644a6cc5bd619f65f6f19781e8ec2e5330f51aa9ada7504cc1926"},
+ {file = "multidict-6.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b04772ed465fa3cc947db808fa306d79b43e896beb677a56fb2347ca1a49c1fa"},
+ {file = "multidict-6.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6180c0ae073bddeb5a97a38c03f30c233e0a4d39cd86166251617d1bbd0af436"},
+ {file = "multidict-6.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:071120490b47aa997cca00666923a83f02c7fbb44f71cf7f136df753f7fa8761"},
+ {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50b3a2710631848991d0bf7de077502e8994c804bb805aeb2925a981de58ec2e"},
+ {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b58c621844d55e71c1b7f7c498ce5aa6985d743a1a59034c57a905b3f153c1ef"},
+ {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55b6d90641869892caa9ca42ff913f7ff1c5ece06474fbd32fb2cf6834726c95"},
+ {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b820514bfc0b98a30e3d85462084779900347e4d49267f747ff54060cc33925"},
+ {file = "multidict-6.1.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10a9b09aba0c5b48c53761b7c720aaaf7cf236d5fe394cd399c7ba662d5f9966"},
+ {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1e16bf3e5fc9f44632affb159d30a437bfe286ce9e02754759be5536b169b305"},
+ {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:76f364861c3bfc98cbbcbd402d83454ed9e01a5224bb3a28bf70002a230f73e2"},
+ {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:820c661588bd01a0aa62a1283f20d2be4281b086f80dad9e955e690c75fb54a2"},
+ {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:0e5f362e895bc5b9e67fe6e4ded2492d8124bdf817827f33c5b46c2fe3ffaca6"},
+ {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ec660d19bbc671e3a6443325f07263be452c453ac9e512f5eb935e7d4ac28b3"},
+ {file = "multidict-6.1.0-cp312-cp312-win32.whl", hash = "sha256:58130ecf8f7b8112cdb841486404f1282b9c86ccb30d3519faf301b2e5659133"},
+ {file = "multidict-6.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:188215fc0aafb8e03341995e7c4797860181562380f81ed0a87ff455b70bf1f1"},
+ {file = "multidict-6.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d569388c381b24671589335a3be6e1d45546c2988c2ebe30fdcada8457a31008"},
+ {file = "multidict-6.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:052e10d2d37810b99cc170b785945421141bf7bb7d2f8799d431e7db229c385f"},
+ {file = "multidict-6.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f90c822a402cb865e396a504f9fc8173ef34212a342d92e362ca498cad308e28"},
+ {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b225d95519a5bf73860323e633a664b0d85ad3d5bede6d30d95b35d4dfe8805b"},
+ {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:23bfd518810af7de1116313ebd9092cb9aa629beb12f6ed631ad53356ed6b86c"},
+ {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c09fcfdccdd0b57867577b719c69e347a436b86cd83747f179dbf0cc0d4c1f3"},
+ {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf6bea52ec97e95560af5ae576bdac3aa3aae0b6758c6efa115236d9e07dae44"},
+ {file = "multidict-6.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57feec87371dbb3520da6192213c7d6fc892d5589a93db548331954de8248fd2"},
+ {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0c3f390dc53279cbc8ba976e5f8035eab997829066756d811616b652b00a23a3"},
+ {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:59bfeae4b25ec05b34f1956eaa1cb38032282cd4dfabc5056d0a1ec4d696d3aa"},
+ {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b2f59caeaf7632cc633b5cf6fc449372b83bbdf0da4ae04d5be36118e46cc0aa"},
+ {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:37bb93b2178e02b7b618893990941900fd25b6b9ac0fa49931a40aecdf083fe4"},
+ {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4e9f48f58c2c523d5a06faea47866cd35b32655c46b443f163d08c6d0ddb17d6"},
+ {file = "multidict-6.1.0-cp313-cp313-win32.whl", hash = "sha256:3a37ffb35399029b45c6cc33640a92bef403c9fd388acce75cdc88f58bd19a81"},
+ {file = "multidict-6.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:e9aa71e15d9d9beaad2c6b9319edcdc0a49a43ef5c0a4c8265ca9ee7d6c67774"},
+ {file = "multidict-6.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:db7457bac39421addd0c8449933ac32d8042aae84a14911a757ae6ca3eef1392"},
+ {file = "multidict-6.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d094ddec350a2fb899fec68d8353c78233debde9b7d8b4beeafa70825f1c281a"},
+ {file = "multidict-6.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5845c1fd4866bb5dd3125d89b90e57ed3138241540897de748cdf19de8a2fca2"},
+ {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9079dfc6a70abe341f521f78405b8949f96db48da98aeb43f9907f342f627cdc"},
+ {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3914f5aaa0f36d5d60e8ece6a308ee1c9784cd75ec8151062614657a114c4478"},
+ {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c08be4f460903e5a9d0f76818db3250f12e9c344e79314d1d570fc69d7f4eae4"},
+ {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d093be959277cb7dee84b801eb1af388b6ad3ca6a6b6bf1ed7585895789d027d"},
+ {file = "multidict-6.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3702ea6872c5a2a4eeefa6ffd36b042e9773f05b1f37ae3ef7264b1163c2dcf6"},
+ {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:2090f6a85cafc5b2db085124d752757c9d251548cedabe9bd31afe6363e0aff2"},
+ {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:f67f217af4b1ff66c68a87318012de788dd95fcfeb24cc889011f4e1c7454dfd"},
+ {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:189f652a87e876098bbc67b4da1049afb5f5dfbaa310dd67c594b01c10388db6"},
+ {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:6bb5992037f7a9eff7991ebe4273ea7f51f1c1c511e6a2ce511d0e7bdb754492"},
+ {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:ac10f4c2b9e770c4e393876e35a7046879d195cd123b4f116d299d442b335bcd"},
+ {file = "multidict-6.1.0-cp38-cp38-win32.whl", hash = "sha256:e27bbb6d14416713a8bd7aaa1313c0fc8d44ee48d74497a0ff4c3a1b6ccb5167"},
+ {file = "multidict-6.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:22f3105d4fb15c8f57ff3959a58fcab6ce36814486500cd7485651230ad4d4ef"},
+ {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4e18b656c5e844539d506a0a06432274d7bd52a7487e6828c63a63d69185626c"},
+ {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a185f876e69897a6f3325c3f19f26a297fa058c5e456bfcff8015e9a27e83ae1"},
+ {file = "multidict-6.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab7c4ceb38d91570a650dba194e1ca87c2b543488fe9309b4212694174fd539c"},
+ {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e617fb6b0b6953fffd762669610c1c4ffd05632c138d61ac7e14ad187870669c"},
+ {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16e5f4bf4e603eb1fdd5d8180f1a25f30056f22e55ce51fb3d6ad4ab29f7d96f"},
+ {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f4c035da3f544b1882bac24115f3e2e8760f10a0107614fc9839fd232200b875"},
+ {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:957cf8e4b6e123a9eea554fa7ebc85674674b713551de587eb318a2df3e00255"},
+ {file = "multidict-6.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:483a6aea59cb89904e1ceabd2b47368b5600fb7de78a6e4a2c2987b2d256cf30"},
+ {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:87701f25a2352e5bf7454caa64757642734da9f6b11384c1f9d1a8e699758057"},
+ {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:682b987361e5fd7a139ed565e30d81fd81e9629acc7d925a205366877d8c8657"},
+ {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ce2186a7df133a9c895dea3331ddc5ddad42cdd0d1ea2f0a51e5d161e4762f28"},
+ {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:9f636b730f7e8cb19feb87094949ba54ee5357440b9658b2a32a5ce4bce53972"},
+ {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:73eae06aa53af2ea5270cc066dcaf02cc60d2994bbb2c4ef5764949257d10f43"},
+ {file = "multidict-6.1.0-cp39-cp39-win32.whl", hash = "sha256:1ca0083e80e791cffc6efce7660ad24af66c8d4079d2a750b29001b53ff59ada"},
+ {file = "multidict-6.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:aa466da5b15ccea564bdab9c89175c762bc12825f4659c11227f515cee76fa4a"},
+ {file = "multidict-6.1.0-py3-none-any.whl", hash = "sha256:48e171e52d1c4d33888e529b999e5900356b9ae588c2f09a52dcefb158b27506"},
+ {file = "multidict-6.1.0.tar.gz", hash = "sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a"},
+]
+
+[package.dependencies]
+typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""}
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
[[package]]
name = "packaging"
version = "24.1"
@@ -1428,25 +1715,6 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
-[[package]]
-name = "pyparsing"
-version = "3.1.4"
-description = "pyparsing module - Classes and methods to define and execute parsing grammars"
-optional = false
-python-versions = ">=3.6.8"
-files = [
- {file = "pyparsing-3.1.4-py3-none-any.whl", hash = "sha256:a6a7ee4235a3f944aa1fa2249307708f893fe5717dc603503c6c7969c070fb7c"},
- {file = "pyparsing-3.1.4.tar.gz", hash = "sha256:f86ec8d1a83f11977c9a6ea7598e8c27fc5cddfa5b07ea2241edbbde1d7bc032"},
-]
-
-[package.extras]
-diagrams = ["jinja2", "railroad-diagrams"]
-
-[package.source]
-type = "legacy"
-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
-reference = "mirrors"
-
[[package]]
name = "pytest"
version = "8.3.3"
@@ -1512,6 +1780,31 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
+[[package]]
+name = "python-socks"
+version = "2.5.2"
+description = "Core proxy (SOCKS4, SOCKS5, HTTP tunneling) functionality for Python"
+optional = false
+python-versions = "*"
+files = [
+ {file = "python_socks-2.5.2-py3-none-any.whl", hash = "sha256:e2511c0d270d5135f8052d5e7ab7c4f089bd0f3fe0f54b8c322f8cbda5db2b2e"},
+ {file = "python_socks-2.5.2.tar.gz", hash = "sha256:1a5220d159f88a92ef2f77d1acb77d175d40cb34af9176609d3cf728cb7499c7"},
+]
+
+[package.dependencies]
+async-timeout = {version = ">=3.0.1", optional = true, markers = "python_version < \"3.11\" and extra == \"asyncio\""}
+
+[package.extras]
+anyio = ["anyio (>=3.3.4,<5.0.0)"]
+asyncio = ["async-timeout (>=3.0.1)"]
+curio = ["curio (>=1.4)"]
+trio = ["trio (>=0.16.0)"]
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
[[package]]
name = "pywin32"
version = "306"
@@ -1623,51 +1916,6 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
-[[package]]
-name = "requests"
-version = "2.31.0"
-description = "Python HTTP for Humans."
-optional = false
-python-versions = ">=3.7"
-files = [
- {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
- {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
-]
-
-[package.dependencies]
-certifi = ">=2017.4.17"
-charset-normalizer = ">=2,<4"
-idna = ">=2.5,<4"
-urllib3 = ">=1.21.1,<3"
-
-[package.extras]
-socks = ["PySocks (>=1.5.6,!=1.5.7)"]
-use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
-
-[package.source]
-type = "legacy"
-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
-reference = "mirrors"
-
-[[package]]
-name = "requests-toolbelt"
-version = "1.0.0"
-description = "A utility belt for advanced users of python-requests"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-files = [
- {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"},
- {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"},
-]
-
-[package.dependencies]
-requests = ">=2.0.1,<3.0.0"
-
-[package.source]
-type = "legacy"
-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
-reference = "mirrors"
-
[[package]]
name = "setuptools"
version = "75.1.0"
@@ -1993,21 +2241,109 @@ url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
[[package]]
-name = "urllib3"
-version = "2.2.3"
-description = "HTTP library with thread-safe connection pooling, file post, and more."
+name = "yarl"
+version = "1.13.1"
+description = "Yet another URL library"
optional = false
python-versions = ">=3.8"
files = [
- {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"},
- {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"},
+ {file = "yarl-1.13.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:82e692fb325013a18a5b73a4fed5a1edaa7c58144dc67ad9ef3d604eccd451ad"},
+ {file = "yarl-1.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df4e82e68f43a07735ae70a2d84c0353e58e20add20ec0af611f32cd5ba43fb4"},
+ {file = "yarl-1.13.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ec9dd328016d8d25702a24ee274932aebf6be9787ed1c28d021945d264235b3c"},
+ {file = "yarl-1.13.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5820bd4178e6a639b3ef1db8b18500a82ceab6d8b89309e121a6859f56585b05"},
+ {file = "yarl-1.13.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86c438ce920e089c8c2388c7dcc8ab30dfe13c09b8af3d306bcabb46a053d6f7"},
+ {file = "yarl-1.13.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3de86547c820e4f4da4606d1c8ab5765dd633189791f15247706a2eeabc783ae"},
+ {file = "yarl-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ca53632007c69ddcdefe1e8cbc3920dd88825e618153795b57e6ebcc92e752a"},
+ {file = "yarl-1.13.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d4ee1d240b84e2f213565f0ec08caef27a0e657d4c42859809155cf3a29d1735"},
+ {file = "yarl-1.13.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:c49f3e379177f4477f929097f7ed4b0622a586b0aa40c07ac8c0f8e40659a1ac"},
+ {file = "yarl-1.13.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5c5e32fef09ce101fe14acd0f498232b5710effe13abac14cd95de9c274e689e"},
+ {file = "yarl-1.13.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ab9524e45ee809a083338a749af3b53cc7efec458c3ad084361c1dbf7aaf82a2"},
+ {file = "yarl-1.13.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:b1481c048fe787f65e34cb06f7d6824376d5d99f1231eae4778bbe5c3831076d"},
+ {file = "yarl-1.13.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:31497aefd68036d8e31bfbacef915826ca2e741dbb97a8d6c7eac66deda3b606"},
+ {file = "yarl-1.13.1-cp310-cp310-win32.whl", hash = "sha256:1fa56f34b2236f5192cb5fceba7bbb09620e5337e0b6dfe2ea0ddbd19dd5b154"},
+ {file = "yarl-1.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:1bbb418f46c7f7355084833051701b2301092e4611d9e392360c3ba2e3e69f88"},
+ {file = "yarl-1.13.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:216a6785f296169ed52cd7dcdc2612f82c20f8c9634bf7446327f50398732a51"},
+ {file = "yarl-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:40c6e73c03a6befb85b72da213638b8aaa80fe4136ec8691560cf98b11b8ae6e"},
+ {file = "yarl-1.13.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2430cf996113abe5aee387d39ee19529327205cda975d2b82c0e7e96e5fdabdc"},
+ {file = "yarl-1.13.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fb4134cc6e005b99fa29dbc86f1ea0a298440ab6b07c6b3ee09232a3b48f495"},
+ {file = "yarl-1.13.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:309c104ecf67626c033845b860d31594a41343766a46fa58c3309c538a1e22b2"},
+ {file = "yarl-1.13.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f90575e9fe3aae2c1e686393a9689c724cd00045275407f71771ae5d690ccf38"},
+ {file = "yarl-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d2e1626be8712333a9f71270366f4a132f476ffbe83b689dd6dc0d114796c74"},
+ {file = "yarl-1.13.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5b66c87da3c6da8f8e8b648878903ca54589038a0b1e08dde2c86d9cd92d4ac9"},
+ {file = "yarl-1.13.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cf1ad338620249f8dd6d4b6a91a69d1f265387df3697ad5dc996305cf6c26fb2"},
+ {file = "yarl-1.13.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:9915300fe5a0aa663c01363db37e4ae8e7c15996ebe2c6cce995e7033ff6457f"},
+ {file = "yarl-1.13.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:703b0f584fcf157ef87816a3c0ff868e8c9f3c370009a8b23b56255885528f10"},
+ {file = "yarl-1.13.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:1d8e3ca29f643dd121f264a7c89f329f0fcb2e4461833f02de6e39fef80f89da"},
+ {file = "yarl-1.13.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:7055bbade838d68af73aea13f8c86588e4bcc00c2235b4b6d6edb0dbd174e246"},
+ {file = "yarl-1.13.1-cp311-cp311-win32.whl", hash = "sha256:a3442c31c11088e462d44a644a454d48110f0588de830921fd201060ff19612a"},
+ {file = "yarl-1.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:81bad32c8f8b5897c909bf3468bf601f1b855d12f53b6af0271963ee67fff0d2"},
+ {file = "yarl-1.13.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f452cc1436151387d3d50533523291d5f77c6bc7913c116eb985304abdbd9ec9"},
+ {file = "yarl-1.13.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9cec42a20eae8bebf81e9ce23fb0d0c729fc54cf00643eb251ce7c0215ad49fe"},
+ {file = "yarl-1.13.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d959fe96e5c2712c1876d69af0507d98f0b0e8d81bee14cfb3f6737470205419"},
+ {file = "yarl-1.13.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8c837ab90c455f3ea8e68bee143472ee87828bff19ba19776e16ff961425b57"},
+ {file = "yarl-1.13.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:94a993f976cdcb2dc1b855d8b89b792893220db8862d1a619efa7451817c836b"},
+ {file = "yarl-1.13.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b2442a415a5f4c55ced0fade7b72123210d579f7d950e0b5527fc598866e62c"},
+ {file = "yarl-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3fdbf0418489525231723cdb6c79e7738b3cbacbaed2b750cb033e4ea208f220"},
+ {file = "yarl-1.13.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6b7f6e699304717fdc265a7e1922561b02a93ceffdaefdc877acaf9b9f3080b8"},
+ {file = "yarl-1.13.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bcd5bf4132e6a8d3eb54b8d56885f3d3a38ecd7ecae8426ecf7d9673b270de43"},
+ {file = "yarl-1.13.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:2a93a4557f7fc74a38ca5a404abb443a242217b91cd0c4840b1ebedaad8919d4"},
+ {file = "yarl-1.13.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:22b739f99c7e4787922903f27a892744189482125cc7b95b747f04dd5c83aa9f"},
+ {file = "yarl-1.13.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:2db874dd1d22d4c2c657807562411ffdfabec38ce4c5ce48b4c654be552759dc"},
+ {file = "yarl-1.13.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4feaaa4742517eaceafcbe74595ed335a494c84634d33961214b278126ec1485"},
+ {file = "yarl-1.13.1-cp312-cp312-win32.whl", hash = "sha256:bbf9c2a589be7414ac4a534d54e4517d03f1cbb142c0041191b729c2fa23f320"},
+ {file = "yarl-1.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:d07b52c8c450f9366c34aa205754355e933922c79135125541daae6cbf31c799"},
+ {file = "yarl-1.13.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:95c6737f28069153c399d875317f226bbdea939fd48a6349a3b03da6829fb550"},
+ {file = "yarl-1.13.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:cd66152561632ed4b2a9192e7f8e5a1d41e28f58120b4761622e0355f0fe034c"},
+ {file = "yarl-1.13.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6a2acde25be0cf9be23a8f6cbd31734536a264723fca860af3ae5e89d771cd71"},
+ {file = "yarl-1.13.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a18595e6a2ee0826bf7dfdee823b6ab55c9b70e8f80f8b77c37e694288f5de1"},
+ {file = "yarl-1.13.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a31d21089894942f7d9a8df166b495101b7258ff11ae0abec58e32daf8088813"},
+ {file = "yarl-1.13.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:45f209fb4bbfe8630e3d2e2052535ca5b53d4ce2d2026bed4d0637b0416830da"},
+ {file = "yarl-1.13.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f722f30366474a99745533cc4015b1781ee54b08de73260b2bbe13316079851"},
+ {file = "yarl-1.13.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3bf60444269345d712838bb11cc4eadaf51ff1a364ae39ce87a5ca8ad3bb2c8"},
+ {file = "yarl-1.13.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:942c80a832a79c3707cca46bd12ab8aa58fddb34b1626d42b05aa8f0bcefc206"},
+ {file = "yarl-1.13.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:44b07e1690f010c3c01d353b5790ec73b2f59b4eae5b0000593199766b3f7a5c"},
+ {file = "yarl-1.13.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:396e59b8de7e4d59ff5507fb4322d2329865b909f29a7ed7ca37e63ade7f835c"},
+ {file = "yarl-1.13.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:3bb83a0f12701c0b91112a11148b5217617982e1e466069d0555be9b372f2734"},
+ {file = "yarl-1.13.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c92b89bffc660f1274779cb6fbb290ec1f90d6dfe14492523a0667f10170de26"},
+ {file = "yarl-1.13.1-cp313-cp313-win32.whl", hash = "sha256:269c201bbc01d2cbba5b86997a1e0f73ba5e2f471cfa6e226bcaa7fd664b598d"},
+ {file = "yarl-1.13.1-cp313-cp313-win_amd64.whl", hash = "sha256:1d0828e17fa701b557c6eaed5edbd9098eb62d8838344486248489ff233998b8"},
+ {file = "yarl-1.13.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:8be8cdfe20787e6a5fcbd010f8066227e2bb9058331a4eccddec6c0db2bb85b2"},
+ {file = "yarl-1.13.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:08d7148ff11cb8e886d86dadbfd2e466a76d5dd38c7ea8ebd9b0e07946e76e4b"},
+ {file = "yarl-1.13.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4afdf84610ca44dcffe8b6c22c68f309aff96be55f5ea2fa31c0c225d6b83e23"},
+ {file = "yarl-1.13.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0d12fe78dcf60efa205e9a63f395b5d343e801cf31e5e1dda0d2c1fb618073d"},
+ {file = "yarl-1.13.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:298c1eecfd3257aa16c0cb0bdffb54411e3e831351cd69e6b0739be16b1bdaa8"},
+ {file = "yarl-1.13.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c14c16831b565707149c742d87a6203eb5597f4329278446d5c0ae7a1a43928e"},
+ {file = "yarl-1.13.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a9bacedbb99685a75ad033fd4de37129449e69808e50e08034034c0bf063f99"},
+ {file = "yarl-1.13.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:658e8449b84b92a4373f99305de042b6bd0d19bf2080c093881e0516557474a5"},
+ {file = "yarl-1.13.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:373f16f38721c680316a6a00ae21cc178e3a8ef43c0227f88356a24c5193abd6"},
+ {file = "yarl-1.13.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:45d23c4668d4925688e2ea251b53f36a498e9ea860913ce43b52d9605d3d8177"},
+ {file = "yarl-1.13.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:f7917697bcaa3bc3e83db91aa3a0e448bf5cde43c84b7fc1ae2427d2417c0224"},
+ {file = "yarl-1.13.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:5989a38ba1281e43e4663931a53fbf356f78a0325251fd6af09dd03b1d676a09"},
+ {file = "yarl-1.13.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:11b3ca8b42a024513adce810385fcabdd682772411d95bbbda3b9ed1a4257644"},
+ {file = "yarl-1.13.1-cp38-cp38-win32.whl", hash = "sha256:dcaef817e13eafa547cdfdc5284fe77970b891f731266545aae08d6cce52161e"},
+ {file = "yarl-1.13.1-cp38-cp38-win_amd64.whl", hash = "sha256:7addd26594e588503bdef03908fc207206adac5bd90b6d4bc3e3cf33a829f57d"},
+ {file = "yarl-1.13.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a0ae6637b173d0c40b9c1462e12a7a2000a71a3258fa88756a34c7d38926911c"},
+ {file = "yarl-1.13.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:576365c9f7469e1f6124d67b001639b77113cfd05e85ce0310f5f318fd02fe85"},
+ {file = "yarl-1.13.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:78f271722423b2d4851cf1f4fa1a1c4833a128d020062721ba35e1a87154a049"},
+ {file = "yarl-1.13.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d74f3c335cfe9c21ea78988e67f18eb9822f5d31f88b41aec3a1ec5ecd32da5"},
+ {file = "yarl-1.13.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1891d69a6ba16e89473909665cd355d783a8a31bc84720902c5911dbb6373465"},
+ {file = "yarl-1.13.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fb382fd7b4377363cc9f13ba7c819c3c78ed97c36a82f16f3f92f108c787cbbf"},
+ {file = "yarl-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c8854b9f80693d20cec797d8e48a848c2fb273eb6f2587b57763ccba3f3bd4b"},
+ {file = "yarl-1.13.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bbf2c3f04ff50f16404ce70f822cdc59760e5e2d7965905f0e700270feb2bbfc"},
+ {file = "yarl-1.13.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fb9f59f3848edf186a76446eb8bcf4c900fe147cb756fbbd730ef43b2e67c6a7"},
+ {file = "yarl-1.13.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ef9b85fa1bc91c4db24407e7c4da93a5822a73dd4513d67b454ca7064e8dc6a3"},
+ {file = "yarl-1.13.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:098b870c18f1341786f290b4d699504e18f1cd050ed179af8123fd8232513424"},
+ {file = "yarl-1.13.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:8c723c91c94a3bc8033dd2696a0f53e5d5f8496186013167bddc3fb5d9df46a3"},
+ {file = "yarl-1.13.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:44a4c40a6f84e4d5955b63462a0e2a988f8982fba245cf885ce3be7618f6aa7d"},
+ {file = "yarl-1.13.1-cp39-cp39-win32.whl", hash = "sha256:84bbcdcf393139f0abc9f642bf03f00cac31010f3034faa03224a9ef0bb74323"},
+ {file = "yarl-1.13.1-cp39-cp39-win_amd64.whl", hash = "sha256:fc2931ac9ce9c61c9968989ec831d3a5e6fcaaff9474e7cfa8de80b7aff5a093"},
+ {file = "yarl-1.13.1-py3-none-any.whl", hash = "sha256:6a5185ad722ab4dd52d5fb1f30dcc73282eb1ed494906a92d1a228d3f89607b0"},
+ {file = "yarl-1.13.1.tar.gz", hash = "sha256:ec8cfe2295f3e5e44c51f57272afbd69414ae629ec7c6b27f5a410efc78b70a0"},
]
-[package.extras]
-brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"]
-h2 = ["h2 (>=4,<5)"]
-socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
-zstd = ["zstandard (>=0.18.0)"]
+[package.dependencies]
+idna = ">=2.0"
+multidict = ">=4.0"
[package.source]
type = "legacy"
@@ -2041,4 +2377,4 @@ reference = "mirrors"
[metadata]
lock-version = "2.0"
python-versions = "<3.13,>=3.10"
-content-hash = "056b2f7a21b0286a04a5ecadb809f6472c636348fe07976ac42c9c47c620f04c"
+content-hash = "4f40efe2d34c2dd6b279869363068ee58b82ac0de10b674eaf50acc3160f8527"
diff --git a/pyproject.toml b/pyproject.toml
index a5e1b4d10..c25caa463 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,11 +13,9 @@ format = "v{base}.{distance}"
[tool.poetry.dependencies]
python = "<3.13,>=3.10"
-cloudscraper = "1.2.71"
colorama = "0.4.4"
pillow = "10.2.0"
pretty-errors = "1.2.19"
-requests = "2.31.0"
tqdm = "4.59.0"
# https://stackoverflow.com/questions/446209/possible-values-from-sys-platform
pywin32 = {version = "^306", markers = "sys_platform == 'win32'"}
@@ -29,6 +27,9 @@ confz = "^2.0.1"
pydantic-extra-types = "^2.9.0"
pendulum = "^3.0.0"
slimeface = "^2024.9.27"
+aiofiles = "^24.1.0"
+aiohttp = "^3.10.8"
+aiohttp-socks = "^0.9.0"
[tool.poetry.scripts]
javsp = "javsp.__main__:entry"
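The dependency changes above swap the synchronous requests/cloudscraper stack for aiohttp, with aiohttp-socks providing SOCKS proxy support and aiofiles covering asynchronous file writes. As a rough orientation, the sketch below shows one plausible way a client session could be assembled from these libraries; the helper name `create_session` and its parameters are illustrative assumptions, not the actual API of `javsp/network/client.py`.

```python
# Hedged sketch only: building an aiohttp session that honours an optional
# http/socks proxy URL, using the newly added dependencies.
# `create_session`, `proxy_server` and `timeout_seconds` are assumed names.
import aiohttp
from aiohttp_socks import ProxyConnector

async def create_session(proxy_server: str | None, timeout_seconds: float = 10.0) -> aiohttp.ClientSession:
    # ProxyConnector.from_url accepts http://, socks4:// and socks5:// URLs alike.
    connector = ProxyConnector.from_url(proxy_server) if proxy_server else aiohttp.TCPConnector()
    return aiohttp.ClientSession(
        connector=connector,
        timeout=aiohttp.ClientTimeout(total=timeout_seconds),
    )
```

A session built this way has to be closed before the event loop shuts down, which is presumably what the new `clear_clients()` helper centralises.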
diff --git a/tools/config_migration.py b/tools/config_migration.py
index 95adc45d6..f08f9ed67 100644
--- a/tools/config_migration.py
+++ b/tools/config_migration.py
@@ -76,13 +76,16 @@ def fix_pat(p):
# 设置代理服务器地址,支持 http, socks5/socks5h 代理,比如'http://127.0.0.1:1080'
# null表示禁用代理
proxy_server: {'null' if proxy_disabled else f"'{cfg['Network']['proxy']}'"}
- # 各个站点的免代理地址。地址失效时软件会自动尝试获取新地址,你也可以手动设置
- proxy_free:
-{'\n'.join([f" {id}: '{url}'" for id, url in dict(cfg['ProxyFree']).items()])}
# 网络问题导致抓取数据失败时的重试次数,通常3次就差不多了
- retry: {cfg['Network']['retry']}
+ retries: {cfg['Network']['retry']}
# https://en.wikipedia.org/wiki/ISO_8601#Durations
timeout: PT{cfg['Network']['timeout']}S
+  # 对列表中的地址不使用梯子(如果启用了的话)
+  unproxied: [{
+    ', '.join([f"'{url}'" for url in dict(cfg['ProxyFree']).values()])
+}]
+ fallback:
+{'\n'.join([f" {id}: ['{url}']" for id, url in dict(cfg['ProxyFree']).items()])}
################################
crawler:
@@ -100,8 +103,6 @@ def fix_pat(p):
hardworking: {yes_to_true(cfg['Crawler']['hardworking_mode'])}
# 使用网页番号作为最终番号(启用时会对番号大小写等进行更正)
respect_site_avid: {yes_to_true(cfg['Crawler']['respect_site_avid'])}
- # fc2fan已关站。如果你有镜像,请设置本地镜像文件夹的路径,此文件夹内要有类似'FC2-12345.html'的网页文件
- fc2fan_local_path: '{cfg['Crawler']['fc2fan_local_path']}'
# 刮削一部电影后的等待时间(设置为0禁用此功能)
# https://en.wikipedia.org/wiki/ISO_8601#Durations
sleep_after_scraping: PT{cfg['Crawler']['sleep_after_scraping']}S
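For the migration itself, the template above folds the old per-site `proxy_free` mapping into the two new keys: every old URL is listed under `unproxied` so the proxy is bypassed for it, and each site keeps its old URL as a one-element `fallback` list. The snippet below restates that mapping in plain Python on a hypothetical old-config section, purely for illustration.

```python
# Illustration of the ProxyFree -> unproxied/fallback mapping performed by the
# migration template; the example URLs are placeholders, not authoritative.
old_proxy_free = {
    'javbus': 'https://www.seedmm.help',
    'javdb': 'https://javdb368.com',
}

unproxied = list(old_proxy_free.values())                         # proxy is skipped for these
fallback = {site: [url] for site, url in old_proxy_free.items()}  # one-element list per site

assert fallback['javdb'] == ['https://javdb368.com']
```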
diff --git a/unittest/test_proxyfree.py b/unittest/test_proxyfree.py
index 1537d93ad..7738a7361 100644
--- a/unittest/test_proxyfree.py
+++ b/unittest/test_proxyfree.py
@@ -1,18 +1,29 @@
-import os
-import sys
-
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from javsp.web.proxyfree import *
+import asyncio
+import tracemalloc
+from javsp.crawlers.proxyfree import get_proxy_free_url
+from javsp.config import CrawlerID
+from javsp.network.client import clear_clients
def test_get_url():
- assert get_proxy_free_url('javlib') != ''
- assert get_proxy_free_url('javdb') != ''
+ async def wrap():
+        assert await get_proxy_free_url(CrawlerID.javlib) is not None
+        assert await get_proxy_free_url(CrawlerID.javdb) is not None
+ await clear_clients()
+ asyncio.run(wrap())
def test_get_url_with_prefer():
- prefer_url = 'https://www.baidu.com'
- assert prefer_url == get_proxy_free_url('javlib', prefer_url)
+ async def wrap():
+ prefer_url = 'https://www.baidu.com'
+ assert prefer_url == await get_proxy_free_url(CrawlerID.javlib, prefer_url)
+ await clear_clients()
+ asyncio.run(wrap())
if __name__ == "__main__":
- print(get_proxy_free_url('javlib'))
+ async def aentry():
+ print(await get_proxy_free_url(CrawlerID.javlib))
+ await clear_clients()
+
+ tracemalloc.start()
+ asyncio.run(aentry(), debug=True)
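The reworked tests follow one pattern: the assertions live in an inner coroutine driven by `asyncio.run()`, and `clear_clients()` is awaited before the loop closes so shared HTTP clients are disposed inside a running loop. Below is a hedged sketch of that pattern as a reusable helper; the wrapper name `run_async_test` is an assumption, while `clear_clients` is the helper the test already imports.

```python
# Sketch of the test pattern used above: run an async body via asyncio.run()
# and always release shared clients before the event loop shuts down.
import asyncio
from javsp.network.client import clear_clients

def run_async_test(coro_func) -> None:
    async def wrap():
        try:
            await coro_func()
        finally:
            # Dispose pooled clients inside the loop to avoid
            # "Unclosed client session" warnings at interpreter exit.
            await clear_clients()
    asyncio.run(wrap())
```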