Skip to content

Commit

Permalink
refactor: separate LinovelibSpider to PC and Mobile
Browse files Browse the repository at this point in the history
  • Loading branch information
wdpm committed Oct 29, 2024
1 parent 6f77c34 commit 90b6909
Show file tree
Hide file tree
Showing 7 changed files with 707 additions and 684 deletions.
23 changes: 3 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Crawl light novel from some websites and convert it to epub.

| 指标分类 | 指标集 |
|------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Software Version | [![Python Version](https://img.shields.io/badge/python>=3.10-blue)]() [![Hatch project](https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg?style=flat)](https://github.com/pypa/hatch) ![PyPI](https://img.shields.io/pypi/v/linovelib2epub) |
| Software Version | [![Python Version](https://img.shields.io/badge/python>=3.10-blue)]() [![Hatch project](https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg?style=flat)](https://github.com/pypa/hatch) |
| Code Style | [![flake8](https://img.shields.io/badge/linter-flake8-brightgreen)](https://github.com/PyCQA/flake8) |
| Code Statistics | ![Lines of code](https://www.aschey.tech/tokei/github/lightnovel-center/linovelib2epub) ![PyPI - Downloads](https://img.shields.io/pypi/dm/linovelib2epub?color=blue&label=PyPI%20Download) |
| Code Activity | [![Hits-of-Code](https://hitsofcode.com/github/lightnovel-center/linovelib2epub?branch=main)](https://hitsofcode.com/github/lightnovel-center/linovelib2epub/view?branch=main) ![GitHub commit activity](https://img.shields.io/github/commit-activity/y/lightnovel-center/linovelib2epub) |
Expand Down Expand Up @@ -99,22 +99,6 @@ py -m pip install -r requirements.txt
python -m pip install -e .
```

### ~~install from pypi~~

> 注意: 由于爬虫程序对时效非常敏感,而 pypi 发布形式目前不再更新,不要使用这种安装方式。
1. Install this package from pypi:

```
pip install linovelib2epub
```

2. update to the latest version:

```
pip install linovelib2epub --upgrade
```

## Some issues you might encounter during installation

> Microsoft Visual C++ 14.0 or greater is required
Expand Down Expand Up @@ -390,12 +374,11 @@ Don't need login, no threshold.
|--------------|--------|----------|---------|--------------------------------------------------|
| http_timeout | number | NO | 10 | 一个 HTTP 请求的超时等待时间 (秒)。代表 connect 和 read timeout。 |
| http_retries | number | NO | 10 | 当一个 HTTP 请求失败后,重试的最大次数。 |
| http_cookie | string | NO | '' | 自定义 HTTP cookie。 |

## Todo

- [ ] feat: add GOT-OCR2.0 engine alternative for linovelib site
- [ ] feat: add epubcheck for output files. see https://epubcheck.readthedocs.io/en/latest/readme.html#using-epubcheck-as-a-python-library
- [ ] feat: add GOT-OCR2.0 engine as an alternative for the linovelib site; support disabling OCR (keep the encrypted text).
- [ ] feat: [optional] add epubcheck validation for output files. See https://epubcheck.readthedocs.io/en/latest/readme.html#using-epubcheck-as-a-python-library
- [ ] quality: setup pytest and codecov
- [ ] quality: setup more formatter and linter for maintainability
- [ ] masiro 繁体 <=> 简体
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ dynaconf==3.2.3
EbookLib==0.17.1
esprima==4.0.1
fake-useragent==1.1.1
importlib-resources==6.4.5
# importlib-resources==6.4.5
inquirer==3.1.2
lxml==5.3.0
pillow==11.0.0
Expand Down
44 changes: 28 additions & 16 deletions src/linovelib2epub/linovel.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from .exceptions import LinovelibException
from .logger import Logger
from .models import LightNovel, LightNovelVolume, LightNovelImage
from .spider import ASYNCIO, LinovelibSpider # type: ignore[attr-defined]
from .spider import ASYNCIO, LinovelibSpiderMobile, LinovelibSpiderPC # type: ignore[attr-defined]
from .spider.masiro_spider import MasiroSpider
from .spider.wenku8_spider import Wenku8Spider
from .utils import (create_folder_if_not_exists, random_useragent,
Expand Down Expand Up @@ -90,7 +90,15 @@ def _write_epub(self,
if cover_filename is None:
cover_filename = 'cover'
_cover_file = cover_filename + '.' + cover_type
book.set_cover(_cover_file, open(cover_file, 'rb').read())
self.logger.debug(f'Cover file: {_cover_file}')
try:
book.set_cover(_cover_file, open(cover_file, 'rb').read())
except:
# MUST set cover file to avoid ebooklib error
width, height = 400, 569
cover_image_fallback = Image.new("RGB", (width, height), "gray")
book.set_cover(_cover_file, cover_image_fallback.tobytes())
self.logger.warning("Cover file is not found or can't be opened. => use empty cover.")

book.spine = ["nav", ]

Expand Down Expand Up @@ -184,9 +192,10 @@ def _write_volume(book: EpubBook,

# COVER STYLE
cover_html = book.get_item_with_id('cover')
self._set_default_cover_style(book, cover_html)
if self.epub_settings["custom_style_cover"]:
self._set_custom_cover_style(book, cover_html)
if cover_html:
self._set_default_cover_style(book, cover_html)
if self.epub_settings["custom_style_cover"]:
self._set_custom_cover_style(book, cover_html)

# NAV STYLE
nav_html = book.get_item_with_id('nav')
Expand Down Expand Up @@ -230,7 +239,7 @@ def _add_image(images_folder: str, illustration: LightNovelImage) -> None:
except (Exception,):
return

# why should we convert all images to jpeg format? => unify to JPEG => get better epub reader support
# unify to JPEG => get better epub reader support
b = io.BytesIO()
img = img.convert('RGB')
img.save(b, 'jpeg')
Expand Down Expand Up @@ -264,7 +273,7 @@ def _get_custom_chapter_style(self) -> EpubItem | None:

@staticmethod
def _get_default_chapter_style() -> EpubItem:
style_chapter = read_pkg_resource('./styles/chapter.css')
style_chapter = read_pkg_resource('styles', 'chapter.css')
default_style_chapter = epub.EpubItem(uid="style_chapter", file_name="styles/chapter.css",
media_type="text/css", content=style_chapter)
return default_style_chapter
Expand All @@ -278,7 +287,7 @@ def _set_custom_cover_style(self, book: EpubBook, cover_html: EpubHtml) -> None:

@staticmethod
def _set_default_cover_style(book: EpubBook, cover_html: EpubHtml) -> None:
default_style_cover_content = read_pkg_resource('./styles/cover.css')
default_style_cover_content = read_pkg_resource('styles', 'cover.css')
default_style_cover = epub.EpubItem(uid="style_cover", file_name="styles/cover.css", media_type="text/css",
content=default_style_cover_content)
cover_html.add_item(default_style_cover)
Expand All @@ -292,7 +301,7 @@ def _set_custom_nav_style(self, book: EpubBook, nav_html: EpubHtml) -> None:

@staticmethod
def _set_default_nav_style(book: EpubBook, nav_html: EpubHtml) -> None:
default_style_nav_content = read_pkg_resource('./styles/nav.css')
default_style_nav_content = read_pkg_resource('styles', 'nav.css')
default_style_nav = epub.EpubItem(uid="style_nav", file_name="styles/nav.css",
media_type="text/css", content=default_style_nav_content)
nav_html.add_item(default_style_nav)
Expand Down Expand Up @@ -336,20 +345,19 @@ def __init__(self,
browser_driver_path: str | None = None,
chapter_crawl_delay: int | None = 3,
page_crawl_delay: int | None = 2,
headless: bool = False
headless: bool = False,
image_download_max_epochs: int | None = None
):
if book_id is None:
raise LinovelibException('book_id parameter must be set.')

self.target_site = target_site

site_to_base_url = {
# https://www.bilinovel.com => https://tw.linovelib.com/ or no changed?
TargetSite.LINOVELIB_MOBILE: 'https://www.bilinovel.com',
TargetSite.LINOVELIB_MOBILE_TRADITIONAL: 'https://www.bilinovel.com',

TargetSite.LINOVELIB_PC: 'https://www.linovelib.com',
# 翻译版本位置:主页底部【简体化】按钮
TargetSite.LINOVELIB_PC_TRADITIONAL: 'https://www.linovelib.com',

TargetSite.MASIRO: 'https://masiro.me',
Expand Down Expand Up @@ -404,11 +412,15 @@ def __init__(self,
'page_crawl_delay': page_crawl_delay,
'headless': headless,
}

if image_download_max_epochs is not None:
self.spider_settings.update({'image_download_max_epochs': image_download_max_epochs})

site_to_spider = {
TargetSite.LINOVELIB_MOBILE: LinovelibSpider,
TargetSite.LINOVELIB_MOBILE_TRADITIONAL: LinovelibSpider,
TargetSite.LINOVELIB_PC: LinovelibSpider,
TargetSite.LINOVELIB_PC_TRADITIONAL: LinovelibSpider,
TargetSite.LINOVELIB_MOBILE: LinovelibSpiderMobile,
TargetSite.LINOVELIB_MOBILE_TRADITIONAL: LinovelibSpiderMobile,
TargetSite.LINOVELIB_PC: LinovelibSpiderPC,
TargetSite.LINOVELIB_PC_TRADITIONAL: LinovelibSpiderPC,
TargetSite.MASIRO: MasiroSpider,
TargetSite.WENKU8: Wenku8Spider,
}
Expand Down
5 changes: 3 additions & 2 deletions src/linovelib2epub/spider/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from .base_spider import (ASYNCIO, MULTIPROCESSING, MULTITHREADING,
BaseNovelWebsiteSpider)
from .linovelib_spider import LinovelibSpider
from .linovelib_spider import LinovelibSpiderMobile,LinovelibSpiderPC

# explicit exports
__all__ = [
BaseNovelWebsiteSpider,
LinovelibSpider,
LinovelibSpiderMobile,
LinovelibSpiderPC,
MULTIPROCESSING,
MULTITHREADING,
ASYNCIO
Expand Down
32 changes: 24 additions & 8 deletions src/linovelib2epub/spider/base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from abc import ABC, abstractmethod
from multiprocessing import Pool
from pathlib import Path
from typing import Iterable, Optional, Callable, Awaitable, Union, Dict, Any, List
from typing import Iterable, Optional, Callable, Awaitable, Union, Dict, Any, List, SupportsBytes

import aiofiles
import aiohttp as aiohttp
Expand Down Expand Up @@ -37,7 +37,9 @@ def __init__(self, spider_settings: Dict[str, Any]) -> None:
log_filename=self.spider_settings["log_filename"]).get_logger()

# in base class, http session is bare
self.session = requests.session()
self._session = requests.session()

self._image_download_max_epochs = self.spider_settings.get('image_download_max_epochs', 10)

self.FETCH_CHAPTER_CONCURRENCY_LEVEL = 2

Expand All @@ -47,7 +49,7 @@ def fetch(self) -> LightNovel:

def request_headers(self) -> Dict[str, Any]:
"""
Return the common HTTP request headers.

In the base class this method is currently used to supply the default
request headers when downloading images.

:return: an empty dict in this base implementation.
"""
return {}
Expand Down Expand Up @@ -89,8 +91,8 @@ def _download_image_legacy(self, download_url: str, local_relative_path: str) ->

# url is valid and never downloaded
try:
resp = self.session.get(download_url, headers=self.request_headers(),
timeout=self.spider_settings['http_timeout'], verify=False)
resp = self._session.get(download_url, headers=self.request_headers(),
timeout=self.spider_settings['http_timeout'], verify=False)

expected_length = resp.headers and resp.headers.get('Content-Length')
actual_length = resp.raw.tell()
Expand Down Expand Up @@ -124,16 +126,23 @@ async def download_images_by_asyncio(self, light_novel_images: List[LightNovelIm
name=image.download_url)
for image in light_novel_images}
pending: set = tasks
self.logger.info(f'Pending task count: {len(pending)}')
succeed_count = 0

while pending:
# Counter for the number of image download rounds (epochs) performed so far
epoch = 0
# 定义一个变量,表示图片下载最大的尝试轮数
max_epochs = self._image_download_max_epochs

while pending and epoch < max_epochs:
done, pending = await asyncio.wait(pending, return_when=asyncio.ALL_COMPLETED)
# Note: This does not raise TimeoutError! Futures that aren't done when the timeout occurs
# are returned in the second set

# 1. succeed => normal result in done(# HAPPY CASE)
# 2. Timeout => No TimeoutError, put timeout tasks in pending(SAD CASE(need retry))
# 3 Other Exception before timeout => (SAD CASE(need retry)
epoch += 1

for done_task in done:
exception = done_task.exception()
Expand All @@ -152,7 +161,13 @@ async def download_images_by_asyncio(self, light_novel_images: List[LightNovelIm
name=task_url))

self.logger.info(f'SUCCEED_COUNT: {succeed_count}')
self.logger.info(f'[NEXT TURN]Pending task count: {len(pending)}')
self.logger.info(f'Image download epochs: {epoch}; [NEXT TURN]Pending task count: {len(pending)}')

if pending:
self.logger.info(f'Try to cancel all pending tasks...')
for task in pending:
self.logger.info(f'Cancelling image download task of {task.get_name()}')
task.cancel()

async def _download_image(self, session: ClientSession, download_url: str, local_relative_path: str) -> None:
if not is_valid_image_url(download_url):
Expand All @@ -168,6 +183,7 @@ async def _download_image(self, session: ClientSession, download_url: str, local

timeout = aiohttp.ClientTimeout(total=30, connect=15) # per request timeout
async with session.get(download_url, headers=self.request_headers(), timeout=timeout) as resp:
# strict mode is 200 OK
if resp.status < 400:
image = await resp.read()

Expand Down Expand Up @@ -313,7 +329,7 @@ async def fetch_chapters(self, session: Any, catalog_list: List[CatalogBaseVolum
page_url_set = {chapter.chapter_url for volume in catalog_list for chapter in volume.chapters}
url_to_page = await self.download_pages(session, page_url_set)

# TODO 下面这部分代码提取到单独的func,不涉及网络请求,只是HTML解构解析
# TODO 下面这部分代码提取到单独的 func,不涉及网络请求,只是 HTML 解构解析

# Main goals:
# 1. extract body and update dict
Expand Down
Loading

0 comments on commit 90b6909

Please sign in to comment.