import requests
from bs4 import BeautifulSoup
import os
import time
import random
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from ebooklib import epub

# Global configuration: browser-like request headers sent with every request.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Referer': 'https://www.po18.tw',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'
}

# The user must paste their own po18.tw login cookies here.
cookies = {
    'session_cookie_name': 'YOUR_COOKIE_VALUE_HERE',
    'another_cookie_name': 'YOUR_ANOTHER_COOKIE_VALUE_HERE',
    # Add more cookies here if needed, e.g.:
    # 'authtoken1': 'YOUR_AUTHTOKEN_VALUE_HERE'
}


def sanitize_filename(filename):
    """Replace characters that are illegal in file names with underscores."""
    return re.sub(r'[\\/*?:"<>|]', '_', filename)


def fetch_page_content(url):
    """Fetch *url* with the shared session and return its HTML text.

    Returns None (after printing the error) on any request failure.
    """
    try:
        response = session.get(url, headers=headers, cookies=cookies, timeout=15)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"抓取頁面時出錯 {url}: {e}")
        return None


def get_chapters_from_page(page_content):
    """Extract the chapter-link anchors from a chapter-list page."""
    soup = BeautifulSoup(page_content, 'lxml')
    return soup.find_all('a', class_='btn_L_blue')


def download_chapter(chapter_url, chapter_number):
    """Download one chapter into <book_name>_chapters/NNN_<title>.txt.

    Retries up to 5 times on request errors. Returns True on success (or if
    the file already exists); on final failure records the chapter in the
    global ``failed_chapters`` list and returns False.
    """
    retries = 5
    while retries > 0:
        try:
            text_url = chapter_url.replace("articles", "articlescontent")
            # Bug fix: the original mutated the shared global `headers` dict
            # from many worker threads at once (a data race). Use a
            # per-request copy instead.
            request_headers = dict(headers)
            request_headers['Referer'] = chapter_url
            response = session.get(text_url, headers=request_headers,
                                   cookies=cookies, timeout=30)
            response.raise_for_status()
            # NOTE(review): the pasted source shows a plain space being
            # stripped; this may originally have been '\xa0' (&nbsp;) — confirm.
            chapter = response.text.replace(" ", '')
            soup = BeautifulSoup(chapter, 'lxml')
            chapter_title = soup.find('h1').get_text()

            book_folder = f'{book_name}_chapters'
            # exist_ok avoids a race when several threads create the folder.
            os.makedirs(book_folder, exist_ok=True)

            formatted_title = f"{chapter_number:03d}_{sanitize_filename(chapter_title)}"
            filename = os.path.join(book_folder, f'{formatted_title}.txt')
            if os.path.exists(filename):
                print(f'{formatted_title} 已經存在。跳過下載。')
                return True

            with open(filename, 'w', encoding='utf-8') as txt:
                print(f'{formatted_title} 處理中...')
                txt.write(f'{formatted_title}\n')
                text = soup.find_all('p')
                if not text:
                    print(f"在第 {chapter_number} 章節未找到文本。")
                for row in text:
                    txt.write(row.get_text() + '\n')
            print(f'{formatted_title} 完成。')
            return True
        except requests.exceptions.RequestException as e:
            print(f"處理章節時出錯 {chapter_url}: {e}")
            retries -= 1
            if retries == 0:
                # Bug fix: the original never populated `failed_chapters`,
                # so retry_failed_chapters() had nothing to retry.
                failed_chapters.append((chapter_url, chapter_number))
                return False
            time.sleep(2)  # brief pause before retrying
    return False


def getContent(page, chapter_number):
    """Walk the chapter-list pages starting at *page* and download chapters
    ``chapter_number``..``chapter_sum`` with a thread pool.

    Rewritten from recursion to a loop so a long book cannot hit the
    recursion limit; behavior is otherwise unchanged.
    """
    while chapter_number <= chapter_sum:
        content_url = f'https://www.po18.tw/books/{book_number}/articles?page={page}'
        page_content = fetch_page_content(content_url)
        if not page_content:
            return
        chapters = get_chapters_from_page(page_content)
        if not chapters:
            print(f"在第 {page} 頁面未找到章節。")
            return
        with ThreadPoolExecutor(max_workers=20) as executor:
            futures = []
            for anchor in chapters:
                if chapter_number > chapter_sum:
                    print("所有章節已經處理完成。")
                    return
                chapter_url = 'https://www.po18.tw' + anchor.get('href')
                futures.append(executor.submit(download_chapter,
                                               chapter_url, chapter_number))
                chapter_number += 1
                time.sleep(random.uniform(1, 2))  # stagger submissions
            for future in as_completed(futures):
                future.result()
        page += 1


def retry_failed_chapters():
    """Retry every chapter recorded in ``failed_chapters``, up to 5 rounds.

    Bug fix: the original appended ``(url, num)`` after the futures loop,
    but those names were only bound inside a list comprehension — a
    guaranteed NameError. download_chapter() now re-records its own
    failures, so this loop only needs to drain and resubmit.
    """
    global failed_chapters
    if not failed_chapters:
        return
    print("重試失敗的章節...")
    retries = 5
    while failed_chapters and retries > 0:
        current_failed_chapters = failed_chapters.copy()
        failed_chapters = []
        with ThreadPoolExecutor(max_workers=20) as executor:
            futures = [executor.submit(download_chapter, url, num)
                       for url, num in current_failed_chapters]
            for future in as_completed(futures):
                # Failures re-append themselves to failed_chapters inside
                # download_chapter; just wait for completion here.
                future.result()
        retries -= 1
        if failed_chapters:
            print(f"剩餘重試次數: {retries}. 重試失敗的章節...")
            time.sleep(2)


def create_epub():
    """Assemble the downloaded chapter .txt files into <book_name>.epub."""
    book_folder = f'{book_name}_chapters'
    if not os.path.exists(book_folder):
        print(f"文件夾 {book_folder} 不存在。請先下載章節。")
        return

    book = epub.EpubBook()
    book.set_identifier(book_number)
    book.set_title(book_name)
    book.set_language('zh')

    # Build the table of contents and reading order.
    toc = []
    spine = ['nav']
    for filename in sorted(os.listdir(book_folder)):
        if not filename.endswith('.txt'):
            continue
        # File names look like "NNN_<title>.txt"; strip the numeric prefix.
        chapter_title = filename.split('_', 1)[1].replace('.txt', '')
        with open(os.path.join(book_folder, filename), 'r', encoding='utf-8') as file:
            content = file.read()
        chapter = epub.EpubHtml(title=chapter_title,
                                file_name=filename.replace('.txt', '.xhtml'))
        # Bug fix: a backslash inside an f-string replacement field is a
        # SyntaxError before Python 3.12, so build the paragraph HTML first.
        paragraphs = content.replace('\n', '</p><p>')
        chapter.content = f"<h1>{chapter_title}</h1><p>{paragraphs}</p>"
        book.add_item(chapter)
        toc.append(chapter)
        spine.append(chapter)

    book.toc = toc
    book.spine = spine

    # Navigation files required by the EPUB spec.
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Basic stylesheet for the generated pages.
    style = '''
    body { font-family: Times, serif; }
    h1 { text-align: center; margin-bottom: 20px; }
    p { text-indent: 1em; margin-top: 0.5em; margin-bottom: 0.5em; }
    '''
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css",
                            media_type="text/css", content=style)
    book.add_item(nav_css)

    # Write the EPUB file next to the script.
    epub_name = sanitize_filename(book_name) + '.epub'
    epub.write_epub(epub_name, book, {})
    print(f'EPUB 文件 "{epub_name}" 已生成。')


# Book-specific settings.
book_number = '778402'
book_name = '快穿之女配勢要撲倒男主'
chapter_sum = 785
start_chapter = 1  # start from the first chapter on page 1

session = requests.Session()
failed_chapters = []  # (chapter_url, chapter_number) pairs that exhausted their retries

if __name__ == "__main__":
    # Guarding the entry point keeps importing this module side-effect free
    # (the original started network I/O at import time).
    getContent(1, start_chapter)
    retry_failed_chapters()
    create_epub()
# Direct link: https://paste.plurk.com/show/6wXIE80PnKe0MKPa7mDe