# Plain-text search: print every line of `file` that contains `keyword`.
# `file` and `keyword` are expected to be defined by the surrounding script.
#
# The file is opened read-only ('r'): this snippet never writes, and 'r+'
# would raise on files that are readable but not writable.
with open(file, mode='r', encoding='utf-8') as f:
    # Iterate the handle directly so the file is streamed line by line
    # instead of being loaded whole with readlines().
    for line, text in enumerate(f):
        if keyword in text:
            # `line` is the 0-based line index; `text` keeps its newline.
            print(f'{line}{text}')
PDF
1 2 3 4 5 6 7
import PyPDF2

# PDF search: record every page of `file` whose extracted text contains
# `keyword`. `file`, `keyword` and the dict `tmp` are expected to be defined
# by the surrounding script.
#
# The handle is managed by `with` so it is closed even if parsing fails;
# the page loop stays inside the block because it reads pages from the
# still-open stream.
with open(file, "rb") as pdf:
    pdf_reader = PyPDF2.PdfReader(pdf)
    for page_num, page in enumerate(pdf_reader.pages):
        if keyword in page.extract_text():
            # Key is the 0-based page index; the value is a placeholder.
            tmp.update({page_num: 'something'})
Word
需要安装 python-docx 库。只能读取 .docx 格式,不支持 .doc。
1 2 3 4 5 6 7 8 9 10
import docx

# Word search: collect every body paragraph and every table cell of `file`
# whose text contains `keyword`. `file`, `keyword` and the dict `tmp` are
# expected to be defined by the surrounding script.
document = docx.Document(file)

# Body paragraphs: keyed by the paragraph object itself.
for para in document.paragraphs:
    if keyword in para.text:
        tmp[para] = para.text

# Table cells: keyed by the string forms of the cell and its row, joined.
for tbl in document.tables:
    for tbl_row in tbl.rows:
        for tbl_cell in tbl_row.cells:
            if keyword in tbl_cell.text:
                tmp[f'{tbl_cell}{tbl_row}'] = tbl_cell.text
Excel
需要安装 openpyxl 库。只能读取 .xlsx 格式,不支持 .xls。
1 2 3 4 5 6 7
import openpyxl

# Excel search: record every cell of every worksheet in `file` whose text
# contains `keyword`. `file`, `keyword` and the dict `tmp` are expected to
# be defined by the surrounding script.
wb = openpyxl.load_workbook(file)
for ws in wb:
    # `ws.values` yields one tuple of raw cell values per row.
    for row in ws.values:
        for value in row:
            # Cell values are not always str (empty cells come back as None,
            # numeric cells as int/float); only text can contain the keyword,
            # and calling a str method on the others would raise.
            if isinstance(value, str) and keyword in value:
                # Key is the whole row tuple, value is the matching cell.
                tmp.update({row: value})
PowerPoint
需要安装 python-pptx 库。只能读取 .pptx 格式,不支持 .ppt。
1 2 3 4 5 6 7 8 9 10 11
import pptx

# PowerPoint search: collect every text run in `file` whose text contains
# `keyword`. `file`, `keyword` and the dict `tmp` are expected to be defined
# by the surrounding script.
presentation = pptx.Presentation(file)
text_runs = []  # assigned here but never read or filled in this snippet

for slide in presentation.slides:
    for shape in slide.shapes:
        # Only shapes that carry a text frame have paragraphs to scan.
        if shape.has_text_frame:
            for para in shape.text_frame.paragraphs:
                for run in para.runs:
                    if keyword in run.text:
                        # Keyed by the run object itself.
                        tmp[run] = run.text
def _search_pdf(file, keyword):
    """Return ``{'page N': keyword}`` for each 1-based page containing keyword."""
    hits = {}
    # `with` closes the handle even when parsing fails; pages are read from
    # the open stream, so the loop must stay inside the block.
    with open(file, "rb") as pdf:
        pdf_reader = PyPDF2.PdfReader(pdf)
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            if keyword in page.extract_text():
                hits[f'page {page_num}'] = keyword
    return hits


def _search_xlsx(file, keyword):
    """Return ``{'Sheet!A1': cell_text}`` for each text cell containing keyword."""
    hits = {}
    wb = openpyxl.load_workbook(file)
    for ws in wb:
        for row in ws.iter_rows():
            for cell in row:
                value = cell.value
                # Empty, numeric and date cells are not str and cannot
                # contain the keyword; skipping them avoids an
                # AttributeError that would discard the whole workbook.
                if isinstance(value, str) and keyword in value:
                    # Keyed by sheet title + cell coordinate so several hits
                    # in one row (or equal rows) do not overwrite each other.
                    hits[f'{ws.title}!{cell.coordinate}'] = value
    return hits


def _search_docx(file, keyword):
    """Return hits from body paragraphs and table cells, keyed by position."""
    hits = {}
    doc = docx.Document(file)
    # Positions are 1-based labels rather than the objects themselves, whose
    # default string form is only a memory address.
    for para_no, paragraph in enumerate(doc.paragraphs, start=1):
        if keyword in paragraph.text:
            hits[f'paragraph {para_no}'] = paragraph.text
    for table_no, table in enumerate(doc.tables, start=1):
        for row_no, row in enumerate(table.rows, start=1):
            for cell_no, cell in enumerate(row.cells, start=1):
                if keyword in cell.text:
                    label = f'table {table_no} row {row_no} cell {cell_no}'
                    hits[label] = cell.text
    return hits


def _search_pptx(file, keyword):
    """Return hits from every text run, keyed by slide/shape/paragraph/run."""
    hits = {}
    prs = pptx.Presentation(file)
    for slide_no, slide in enumerate(prs.slides, start=1):
        for shape_no, shape in enumerate(slide.shapes, start=1):
            # Shapes without a text frame have nothing to scan.
            if not shape.has_text_frame:
                continue
            paragraphs = shape.text_frame.paragraphs
            for para_no, paragraph in enumerate(paragraphs, start=1):
                for run_no, run in enumerate(paragraph.runs, start=1):
                    if keyword in run.text:
                        label = (
                            f'slide {slide_no} shape {shape_no} '
                            f'paragraph {para_no} run {run_no}'
                        )
                        hits[label] = run.text
    return hits


def _search_text(file, keyword):
    """Return ``{line_number: line}`` for each 1-based line containing keyword."""
    hits = {}
    # Read-only: nothing is written, and 'r+' fails on read-only files.
    with open(file, mode='r', encoding='utf-8') as f:
        for line_no, text in enumerate(f, start=1):
            if keyword in text:
                # Strip only the newline; slicing off the last character
                # would eat real text on a final line without one.
                hits[line_no] = text.rstrip('\n')
    return hits


# Lower-cased suffix -> searcher; any other suffix is treated as UTF-8 text.
_SEARCHERS = {
    '.pdf': _search_pdf,
    '.xlsx': _search_xlsx,
    '.docx': _search_docx,
    '.pptx': _search_pptx,
}

# Walk the candidate paths and collect the hits.
# result = {file1: {location1: text1, location2: text2, ...}, ...}
# `files`, `limit`, `suffix`, `keyword` and `result` come from the
# surrounding script.
for file in files:
    # Skip directories.
    if not file.is_file():
        continue
    # Skip large files.
    if file.stat().st_size > limit:
        continue
    # Skip excluded suffixes.
    if file.suffix in suffix:
        continue

    # Dispatch case-insensitively so e.g. '.PDF' is parsed as a PDF rather
    # than being decoded as text.
    search = _SEARCHERS.get(file.suffix.lower(), _search_text)

    # Best effort: an unreadable or corrupt file is reported and skipped so
    # it cannot abort the whole scan.
    try:
        tmp = search(file, keyword)
    except Exception as e:
        print(f'ERROR {e}{file}')
    else:
        # Only files with at least one hit are kept.
        if tmp:
            result[file] = tmp
# Print the results: one numbered header per file (index and path wrapped in
# ANSI escape sequences), then one line per hit showing the location label
# followed by the matched text. `result` comes from the surrounding script.
for index, (path, hits) in enumerate(result.items(), start=1):
    print(f"\n\033[1;44m{index}\033[0m", f"\033[1;31m{path}\033[0m")
    for location, matched in hits.items():
        print(f" - \033[4m{location}\033[0m: ", matched)
One More Thing
注意:只能处理纯文本文件。用 Python 跑起来可能比较慢,于是让 ChatGPT 写了一个 C 语言版本,在 Windows 和 macOS 上试过,可以正常使用。其实本来想用 C++ 写,但标准库里的文件处理 API 我没能搞定,最后只好请教 AI 了😉。