import requests
from bs4 import BeautifulSoup
import os
import time
import random
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from ebooklib import epub

# Global configuration
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Referer': 'https://www.po18.tw',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'
}

# Fill in your own cookies here
cookies = {
    'session_cookie_name': 'YOUR_COOKIE_VALUE_HERE',
    'another_cookie_name': 'YOUR_ANOTHER_COOKIE_VALUE_HERE',
    # Add more cookies here if needed
    # Example: 'authtoken1': 'YOUR_AUTHTOKEN_VALUE_HERE'
}
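# Note: the cookie names above are placeholders, not the real po18.tw cookie names.
# One way to fill them in (an assumption about your setup) is to log in to
# https://www.po18.tw in a browser and copy the session cookies from the
# developer tools into this dict.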

# Replace characters that are illegal in filenames with underscores
def sanitize_filename(filename):
    return re.sub(r'[\\/*?:"<>|]', '_', filename)

# Fetch a page and return its HTML, or None on failure
def fetch_page_content(url):
    try:
        response = session.get(url, headers=headers, cookies=cookies, timeout=15)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"抓取頁面時出錯 {url}: {e}")
        return None

# Extract chapter links from a listing page
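# Assumption about the site's markup: each chapter on the listing page is an
# <a class="btn_L_blue"> whose href is a relative path such as
# /books/<book_number>/articles/<chapter_id>; download_chapter relies on that
# path shape when it rewrites "articles" to "articlescontent".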
def get_chapters_from_page(page_content):
    soup = BeautifulSoup(page_content, 'lxml')
    return soup.find_all('a', class_='btn_L_blue')

# Download a single chapter, with up to 5 retries
def download_chapter(chapter_url, chapter_number):
    retries = 5
    while retries > 0:
        try:
            text_url = chapter_url.replace("articles", "articlescontent")
            # Use a per-request copy of the headers so concurrent downloads
            # don't overwrite each other's Referer.
            request_headers = dict(headers)
            request_headers['Referer'] = chapter_url
            response = session.get(text_url, headers=request_headers, cookies=cookies, timeout=30)
            response.raise_for_status()

            chapter = response.text.replace("&nbsp;&nbsp;", '')
            soup = BeautifulSoup(chapter, 'lxml')
            chapter_title = soup.find('h1').get_text()

            book_folder = f'{book_name}_chapters'
            os.makedirs(book_folder, exist_ok=True)

            formatted_title = f"{chapter_number:03d}_{sanitize_filename(chapter_title)}"
            filename = os.path.join(book_folder, f'{formatted_title}.txt')

            if os.path.exists(filename):
                print(f'{formatted_title} already exists. Skipping download.')
                return True  # Counts as success

            with open(filename, 'w', encoding='utf-8') as txt:
                print(f'{formatted_title} processing...')
                txt.write(f'{formatted_title}\n')
                text = soup.find_all('p')
                if not text:
                    print(f"No text found in chapter {chapter_number}.")
                for row in text:
                    txt.write(row.get_text() + '\n')
                print(f'{formatted_title} done.')
            return True  # Download succeeded
        except requests.exceptions.RequestException as e:
            print(f"Error processing chapter {chapter_url}: {e}")
            retries -= 1
            if retries == 0:
                return False  # Gave up after exhausting retries
            time.sleep(2)  # Wait briefly before retrying
    return False  # Still failed after all retries
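
# Example usage (the chapter URL below is hypothetical):
#   ok = download_chapter('https://www.po18.tw/books/778402/articles/1234567', 1)
# download_chapter returns True on success and False once its retries are exhausted.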

# Walk the chapter listing pages and download every chapter
def getContent(page, chapter_number):
    content_url = f'https://www.po18.tw/books/{book_number}/articles?page={page}'

    page_content = fetch_page_content(content_url)
    if not page_content:
        return

    chapters = get_chapters_from_page(page_content)
    if not chapters:
        print(f"No chapters found on page {page}.")
        return

    with ThreadPoolExecutor(max_workers=20) as executor:
        futures = {}
        for i in range(len(chapters)):
            if chapter_number > chapter_sum:
                print("All chapters have been processed.")
                break

            chapter_url = 'https://www.po18.tw' + chapters[i].get('href')
            future = executor.submit(download_chapter, chapter_url, chapter_number)
            futures[future] = (chapter_url, chapter_number)
            chapter_number += 1
            time.sleep(random.uniform(1, 2))  # Delay between submissions

        for future in as_completed(futures):
            if not future.result():
                # Record the chapter so retry_failed_chapters() can try it again
                failed_chapters.append(futures[future])

    if chapter_number <= chapter_sum:
        page += 1
        getContent(page, chapter_number)

# Retry chapters that failed to download
def retry_failed_chapters():
    global failed_chapters
    if failed_chapters:
        print("Retrying failed chapters...")
        retries = 5
        while failed_chapters and retries > 0:
            current_failed_chapters = failed_chapters.copy()
            failed_chapters = []
            with ThreadPoolExecutor(max_workers=20) as executor:
                # Map each future back to its (url, chapter_number) pair so failures can be re-queued
                futures = {executor.submit(download_chapter, url, num): (url, num)
                           for url, num in current_failed_chapters}
                for future in as_completed(futures):
                    if not future.result():
                        failed_chapters.append(futures[future])
            retries -= 1
            if failed_chapters:
                print(f"Retries remaining: {retries}. Retrying failed chapters...")
            time.sleep(2)  # Wait briefly before the next round

# Build an EPUB file from the downloaded chapter files
def create_epub():
    book_folder = f'{book_name}_chapters'
    if not os.path.exists(book_folder):
        print(f"Folder {book_folder} does not exist. Download the chapters first.")
        return

    book = epub.EpubBook()
    book.set_identifier(book_number)
    book.set_title(book_name)
    book.set_language('zh')

    # Table of contents and reading order
    toc = []
    spine = ['nav']

    # Sorting works because filenames start with a zero-padded chapter number (001_, 002_, ...)
    for filename in sorted(os.listdir(book_folder)):
        if filename.endswith('.txt'):
            chapter_title = filename.split('_', 1)[1].replace('.txt', '')
            with open(os.path.join(book_folder, filename), 'r', encoding='utf-8') as file:
                content = file.read()

            chapter = epub.EpubHtml(title=chapter_title, file_name=filename.replace('.txt', '.xhtml'))
            # Turn each line of the text file into its own paragraph.
            # (Built outside the f-string: backslashes inside f-string expressions
            # are a syntax error before Python 3.12.)
            body = content.replace('\n', '</p><p>')
            chapter.content = f"<h1>{chapter_title}</h1><p>{body}</p>"

            book.add_item(chapter)
            toc.append(chapter)
            spine.append(chapter)

    # Define the table of contents
    book.toc = toc
    book.spine = spine

    # Add the navigation files
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Set up the CSS
    style = '''
    body { font-family: Times, serif; }
    h1 { text-align: center; margin-bottom: 20px; }
    p { text-indent: 1em; margin-top: 0.5em; margin-bottom: 0.5em; }
    '''
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
    book.add_item(nav_css)

    # Write the EPUB file
    epub_name = sanitize_filename(book_name) + '.epub'
    epub.write_epub(epub_name, book, {})

    print(f'EPUB file "{epub_name}" has been generated.')

# Main program
book_number = '778402'
book_name = '快穿之女配勢要撲倒男主'
chapter_sum = 785

# Start from chapter 1 on the first listing page
start_chapter = 1
session = requests.session()

failed_chapters = []

# Download all chapters
getContent(1, start_chapter)

# Retry any chapters that failed
retry_failed_chapters()

# Generate the EPUB file
create_epub()