1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
"""
PDF 關鍵字擷取 (繁體中文, Tkinter)
版本:自由文字選取 + 一鍵複製版
功能:
- Windows7 / Windows11 可執行 (需安裝 Tesseract 與 Poppler)
- 將 PDF 轉圖片後以 Tesseract (chi_tra) OCR
- 使用者可新增/刪除關鍵字(不寫檔)
- 結果改用 Text 顯示,可自由拖曳選取部分文字
- 可用 Ctrl+C 或按「複製選取文字」按鈕複製內容
"""

import os
import queue
import shutil
import subprocess
import tempfile
import threading
import tkinter as tk
from tkinter import filedialog, messagebox, simpledialog

from pdf2image import convert_from_path
from PIL import Image

# ------------------ Adjust these three paths before use ------------------
# Full path of the Tesseract executable; the search aborts with an error dialog if it is not a file.
TESSERACT_PATH = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Tesseract install directory; its "tessdata" subfolder is used when present, otherwise this directory itself.
TESSDATA_ROOT = r"C:\Program Files\Tesseract-OCR"
# Poppler "bin" directory passed to pdf2image as poppler_path; when left empty no poppler_path is passed.
POPPLER_PATH = r"" # e.g. r"C:\poppler-xx\Library\bin"
# --------------------------------------------------------------------------

# Default keywords; the add/delete dialogs modify this list in place (nothing is written to disk).
keywords = ["UR", "糞便培養", "糞", "潛血", "毒品"]

class KeywordFinderApp:
    """Tkinter window that OCRs a PDF and lists the lines containing keywords.

    Each page of the selected PDF is rendered to a PNG, passed to the
    Tesseract command-line program with the ``chi_tra`` language, and every
    recognised line that contains one of the module-level ``keywords`` is
    shown in a read-only text box from which text can be selected and copied.

    OCR runs on a background thread.  That thread never calls into Tk: the
    UI helper methods notice when they are invoked off the UI thread and post
    their work to a queue that the UI thread drains periodically.
    """

    # Interval, in milliseconds, at which the UI thread drains queued UI work.
    _UI_POLL_MS = 50

    def __init__(self, master):
        """Build all widgets inside *master* (the Tk root window)."""
        self.master = master
        master.title("PDF 關鍵字擷取工具 (繁體中文,自由選取+一鍵複製)")
        master.geometry("820x620")

        self.current_pdf_path = None

        # Cross-thread hand-off: the worker posts (callable, args) tuples and
        # the thread that built the widgets executes them.
        self._ui_queue = queue.Queue()
        self._ui_thread_id = threading.get_ident()

        # Top row: file operations.
        frame_top = tk.Frame(master, padx=8, pady=8)
        frame_top.pack(fill='x')

        self.btn_open = tk.Button(frame_top, text="① 開啟 PDF 檔案", command=self.open_pdf, width=18)
        self.btn_open.pack(side='left', padx=6)

        self.btn_search = tk.Button(frame_top, text="② 開始搜尋關鍵字", command=self.start_search_thread, width=18, bg='#ffdddd')
        self.btn_search.pack(side='left', padx=6)

        self.label_pdf = tk.Label(frame_top, text="目前未開啟檔案", anchor='w')
        self.label_pdf.pack(side='left', padx=12)

        # Keyword management row.
        frame_kw = tk.Frame(master, padx=8, pady=6)
        frame_kw.pack(fill='x')

        tk.Label(frame_kw, text="當前關鍵字:").pack(side='left')
        self.var_kw = tk.StringVar(value=", ".join(keywords))
        self.lbl_kw = tk.Label(frame_kw, textvariable=self.var_kw, fg='blue')
        self.lbl_kw.pack(side='left', padx=6)

        tk.Button(frame_kw, text="新增關鍵字", command=self.add_keyword, width=12).pack(side='right', padx=6)
        tk.Button(frame_kw, text="刪除關鍵字", command=self.delete_keyword, width=12).pack(side='right', padx=6)

        # Results area with a copy button underneath.
        frame_results = tk.Frame(master, padx=8, pady=6)
        frame_results.pack(fill='both', expand=True)

        tk.Label(frame_results, text="搜尋結果(可自由拖曳選取文字或使用Ctrl+C)").pack(anchor='w')

        frame_text = tk.Frame(frame_results)
        frame_text.pack(fill='both', expand=True)

        scrollbar = tk.Scrollbar(frame_text)
        scrollbar.pack(side='right', fill='y')

        # A Text widget (kept disabled between writes) lets the user drag-select
        # arbitrary portions of the results.
        self.textbox = tk.Text(frame_text, wrap='word', yscrollcommand=scrollbar.set)
        self.textbox.pack(fill='both', expand=True)
        scrollbar.config(command=self.textbox.yview)
        self.textbox.config(state='disabled')

        tk.Button(frame_results, text="複製選取文字", command=self.copy_selected_text, width=18, bg='#ddf').pack(anchor='e', pady=(4, 0))

        # Status bar.
        self.status_var = tk.StringVar(value="準備就緒")
        tk.Label(master, textvariable=self.status_var, anchor='w').pack(fill='x', padx=8, pady=(0,8))

        # Start the periodic pump that executes UI work posted by the worker.
        master.after(self._UI_POLL_MS, self._drain_ui_queue)

    # ------------------ Keyword management ------------------
    def update_kw_label(self):
        """Refresh the label that lists the current keywords."""
        self.var_kw.set(", ".join(keywords))

    def add_keyword(self):
        """Prompt for a keyword and append it if it is non-blank and new."""
        entered = simpledialog.askstring("新增關鍵字", "請輸入關鍵字:", parent=self.master)
        if not entered:
            return
        entered = entered.strip()
        if entered and entered not in keywords:
            keywords.append(entered)
            self.update_kw_label()

    def delete_keyword(self):
        """Prompt for a keyword and remove it, or say that it is not listed."""
        entered = simpledialog.askstring("刪除關鍵字", "請輸入要刪除的關鍵字:", parent=self.master)
        if not entered:
            return
        entered = entered.strip()
        if entered in keywords:
            keywords.remove(entered)
            self.update_kw_label()
        else:
            messagebox.showinfo("提示", "關鍵字不在目前列表中。")

    # ------------------ Open PDF ------------------
    def open_pdf(self):
        """Let the user pick a PDF and remember its path for the next search."""
        path = filedialog.askopenfilename(title="選擇 PDF 檔案", filetypes=[("PDF 檔案", "*.pdf")])
        if not path:
            self.status_var.set("未選擇檔案")
            return
        self.current_pdf_path = path
        self.label_pdf.config(text=f"目前檔案:{os.path.basename(path)}")
        self._set_text(f"已載入檔案:{os.path.basename(path)}。按「開始搜尋關鍵字」。")
        self.status_var.set("已載入檔案")

    # ------------------ Background search ------------------
    def start_search_thread(self):
        """Disable the buttons and run :meth:`start_search` on a daemon thread."""
        if not self.current_pdf_path:
            messagebox.showwarning("警告", "請先開啟一個 PDF 檔案。")
            return
        self.btn_open.config(state='disabled')
        self.btn_search.config(state='disabled')
        self._set_text("開始處理...(請稍候)")
        self.status_var.set("處理中")
        threading.Thread(target=self.start_search, daemon=True).start()

    def start_search(self):
        """OCR every page of the current PDF and report lines with keywords.

        Intended to run on the worker thread started by
        :meth:`start_search_thread`.  All user-visible output goes through the
        UI helper methods, which hand the work to the UI thread.  Any failure
        is reported through an error dialog, and the buttons are re-enabled
        on every exit path.
        """
        if not os.path.isfile(TESSERACT_PATH):
            self._show_error_and_finish(f"找不到 tesseract:{TESSERACT_PATH}")
            return

        # Accept either the install root or the tessdata folder itself.
        tessdata_dir = TESSDATA_ROOT
        nested = os.path.join(TESSDATA_ROOT, "tessdata")
        if os.path.isdir(nested):
            tessdata_dir = nested
        os.environ["TESSDATA_PREFIX"] = tessdata_dir

        chi_path = os.path.join(tessdata_dir, "chi_tra.traineddata")
        if not os.path.isfile(chi_path):
            self._show_error_and_finish(f"找不到 chi_tra.traineddata:{chi_path}")
            return

        # Snapshot so that edits made in the UI during the run cannot change
        # the list while it is being iterated here.
        kw_snapshot = list(keywords)

        temp_dir = None
        try:
            temp_dir = tempfile.mkdtemp(prefix="pdf2img_")
            found_count = self._search_pages(temp_dir, tessdata_dir, kw_snapshot)
        except Exception as exc:
            # Thread boundary: without this the worker would die silently
            # (e.g. Poppler missing, unreadable PDF) and the UI would keep
            # showing "processing".
            self._show_error_and_finish(f"處理 PDF 時發生錯誤:{exc}")
            return
        finally:
            if temp_dir is not None:
                shutil.rmtree(temp_dir, ignore_errors=True)

        if found_count == 0:
            self._append_text("\n===== 搜尋完成:未找到任何關鍵字 =====\n")
        else:
            self._append_text(f"\n===== 搜尋完成:共找到 {found_count} 筆結果 =====\n")
        self._set_status("完成")
        self._finish_buttons()

    def _search_pages(self, temp_dir, tessdata_dir, kw_list):
        """Render, OCR and scan each page; return the number of matching lines.

        A page whose OCR subprocess fails is reported in the text box and
        skipped; errors from PDF rendering or image saving propagate to the
        caller.
        """
        convert_kwargs = {}
        if POPPLER_PATH:
            convert_kwargs['poppler_path'] = POPPLER_PATH

        pages = convert_from_path(self.current_pdf_path, **convert_kwargs)
        self._set_text("")  # clear the display

        found_count = 0
        for page_no, page in enumerate(pages, start=1):
            self._append_text(f"處理第 {page_no} 頁...\n")
            img_path = os.path.join(temp_dir, f"page_{page_no - 1}.png")
            page.save(img_path, "PNG")

            try:
                text = self._ocr_page(img_path, tessdata_dir)
            except subprocess.CalledProcessError as exc:
                # Prefer Tesseract's own diagnostics over the generic
                # "returned non-zero exit status" message.
                detail = (exc.stderr or "").strip() or exc
                self._append_text(f"[錯誤] 第 {page_no} 頁 OCR 失敗:{detail}\n")
                continue
            except (OSError, subprocess.SubprocessError) as exc:
                self._append_text(f"[錯誤] 第 {page_no} 頁 OCR 失敗:{exc}\n")
                continue

            for hit in self._match_keywords(text, kw_list, page_no):
                self._append_text(hit + "\n")
                found_count += 1
        return found_count

    @staticmethod
    def _ocr_page(img_path, tessdata_dir):
        """Run Tesseract (chi_tra, --psm 6) on one image and return its stdout.

        Raises ``subprocess.CalledProcessError`` when Tesseract exits with a
        non-zero status and ``OSError`` when it cannot be started.
        """
        cmd = [
            TESSERACT_PATH,
            img_path,
            "stdout",
            "-l", "chi_tra",
            "--psm", "6",
            "--tessdata-dir", tessdata_dir,
        ]
        proc = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8', errors='replace', check=True)
        return proc.stdout or ""

    @staticmethod
    def _match_keywords(text, kw_list, page_no):
        """Return one formatted result per line of *text* that holds a keyword.

        Lines are stripped and blank lines ignored.  Matching is
        case-insensitive, and a line is reported once, under the first keyword
        of *kw_list* it contains.
        """
        lowered = [(kw, kw.lower()) for kw in kw_list if kw]
        hits = []
        for raw_line in text.splitlines():
            line = raw_line.strip()
            if not line:
                continue
            line_lower = line.lower()
            for kw, kw_lower in lowered:
                if kw_lower in line_lower:
                    hits.append(f"[第{page_no}頁/關鍵字:{kw}] {line}")
                    break
        return hits

    # ------------------ One-click copy ------------------
    def copy_selected_text(self):
        """Copy the text selected in the results box to the clipboard."""
        try:
            # Read this widget's own selection; raises TclError when nothing
            # is selected.
            selected_text = self.textbox.get(tk.SEL_FIRST, tk.SEL_LAST)
        except tk.TclError:
            messagebox.showwarning("提示", "請先選取要複製的文字。")
            return
        self.master.clipboard_clear()
        self.master.clipboard_append(selected_text)
        self.master.update()
        messagebox.showinfo("已複製", "選取文字已複製到剪貼簿。")

    # ------------------ UI-thread plumbing ------------------
    def _call_in_ui(self, func, *args):
        """Run ``func(*args)`` now if on the UI thread, otherwise queue it."""
        if threading.get_ident() == self._ui_thread_id:
            func(*args)
        else:
            self._ui_queue.put((func, args))

    def _drain_ui_queue(self):
        """Execute all queued UI work, then reschedule this pump."""
        try:
            while True:
                func, args = self._ui_queue.get_nowait()
                func(*args)
        except queue.Empty:
            pass
        finally:
            self.master.after(self._UI_POLL_MS, self._drain_ui_queue)

    # ------------------ Helper methods ------------------
    def _set_text(self, content):
        """Replace the results box content with *content* plus a newline."""
        self._call_in_ui(self._replace_text_now, content)

    def _append_text(self, content):
        """Append *content* to the results box and scroll to the end."""
        self._call_in_ui(self._append_text_now, content)

    def _set_status(self, text):
        """Show *text* in the status bar."""
        self._call_in_ui(self.status_var.set, text)

    def _finish_buttons(self):
        """Re-enable the open and search buttons."""
        self._call_in_ui(self._enable_buttons_now)

    def _show_error_and_finish(self, msg):
        """Show *msg* in an error dialog, mark the status as error and re-enable the buttons."""
        self._call_in_ui(self._show_error_now, msg)

    def _replace_text_now(self, content):
        """UI-thread implementation of :meth:`_set_text`."""
        self.textbox.config(state='normal')
        self.textbox.delete("1.0", tk.END)
        self.textbox.insert(tk.END, content + "\n")
        self.textbox.config(state='disabled')
        self.textbox.see(tk.END)

    def _append_text_now(self, content):
        """UI-thread implementation of :meth:`_append_text`."""
        self.textbox.config(state='normal')
        self.textbox.insert(tk.END, content)
        self.textbox.config(state='disabled')
        self.textbox.see(tk.END)

    def _enable_buttons_now(self):
        """UI-thread implementation of :meth:`_finish_buttons`."""
        self.btn_open.config(state='normal')
        self.btn_search.config(state='normal')

    def _show_error_now(self, msg):
        """UI-thread implementation of :meth:`_show_error_and_finish`."""
        messagebox.showerror("錯誤", msg)
        self.status_var.set("錯誤")
        self._enable_buttons_now()

if __name__ == "__main__":
    # Script entry point: create the Tk root window, attach the application
    # to it, and hand control to the Tk event loop until the window closes.
    root = tk.Tk()
    app = KeywordFinderApp(root)
    root.mainloop()