尽管操作系统的搜索功能已经很完善了,但一般搜索范围仅限文件(夹)名。因此写了个能搜索文件内容的代码。

遍历目录

这里我使用了 Python 自带的 pathlib

1
2
3
4
5
6
7
8
import pathlib

# Directory to search, wrapped in a Path object right away
path = pathlib.Path(r'')
# Option 1: recursive pattern, sub-folders included
files = path.glob('**/*')
# Option 2: direct children only (this later assignment wins if both are kept)
files = path.glob('*')

文本搜索

文本文件

编码默认用 utf-8,也可以改成其他的,比如 gbk、utf-16、unicode 等。

1
2
3
4
5
# Search a plain-text file line by line.
# Expects `file` (path) and `keyword` (str) to be defined by the caller.
# Read-only mode is enough for searching; 'r+' would also require write access.
with open(file, mode='r', encoding='utf-8') as f:
    # Iterate the file object directly instead of loading every line at once;
    # numbering starts at 1 so the printed value is the real line number.
    for line, text in enumerate(f, start=1):
        if keyword in text:
            # Drop the trailing newline so print() does not emit a blank line
            print(f'{line} {text.rstrip(chr(10))}')

PDF

1
2
3
4
5
6
7
import PyPDF2

# Search a PDF page by page.
# Expects `file`, `keyword` and the result dict `tmp` to be defined by the caller.
# The context manager closes the file handle once the scan is finished.
with open(file, "rb") as pdf:
    pdf_reader = PyPDF2.PdfReader(pdf)
    # Pages are numbered from 1, the way a PDF viewer shows them
    for page_num, page in enumerate(pdf_reader.pages, start=1):
        # `or ''` guards against a page that yields no extractable text
        if keyword in (page.extract_text() or ''):
            tmp.update({page_num: 'something'})

Word

需要安装 python-docx 库,只能读取.docx 格式,.doc 不行。

1
2
3
4
5
6
7
8
9
10
import docx

# Search a .docx document.
# Expects `file`, `keyword` and the result dict `tmp` to be defined by the caller.
doc = docx.Document(file)
# Paragraphs are keyed by their 1-based position, not by the object itself,
# so the key is readable when printed.
for paragraph_index, paragraph in enumerate(doc.paragraphs, start=1):
    if keyword in paragraph.text:
        tmp.update({f'paragraph {paragraph_index}': paragraph.text})
# Tables are walked separately, cell by cell
for table_index, table in enumerate(doc.tables, start=1):
    for row_index, row in enumerate(table.rows, start=1):
        for cell_index, cell in enumerate(row.cells, start=1):
            if keyword in cell.text:
                location = f'table {table_index} row {row_index} cell {cell_index}'
                tmp.update({location: cell.text})

Excel

需要安装 openpyxl 库,只能读取.xlsx 格式,.xls 不行。

1
2
3
4
5
6
7
import openpyxl

# Search every cell of every worksheet in an .xlsx workbook.
# Expects `file`, `keyword` and the result dict `tmp` to be defined by the caller.
wb = openpyxl.load_workbook(file)
for ws in wb:
    for row in ws.iter_rows():
        for cell in row:
            # Empty cells hold None and numeric cells hold int/float, neither
            # of which has .find(); skip the former and stringify the latter.
            if cell.value is None:
                continue
            text = str(cell.value)
            if keyword in text:
                # Sheet title + coordinate (e.g. 'Sheet1!B3') is unique per
                # cell, so several hits in one row no longer overwrite each other
                tmp.update({f'{ws.title}!{cell.coordinate}': text})

PowerPoint

需要安装 python-pptx 库,只能读取.pptx 格式,.ppt 不行。

1
2
3
4
5
6
7
8
9
10
11
import pptx

# Search the text runs of every shape on every slide of a .pptx file.
# Expects `file`, `keyword` and the result dict `tmp` to be defined by the caller.
prs = pptx.Presentation(file)
for slide_index, slide in enumerate(prs.slides, start=1):
    for shape_index, shape in enumerate(slide.shapes, start=1):
        # Pictures, charts, etc. carry no text frame
        if not shape.has_text_frame:
            continue
        for paragraph_index, paragraph in enumerate(shape.text_frame.paragraphs, start=1):
            for run_index, run in enumerate(paragraph.runs, start=1):
                if keyword in run.text:
                    # Positional key instead of the run object, so it is
                    # readable when printed and unique per run
                    location = (
                        f'slide {slide_index} shape {shape_index} '
                        f'paragraph {paragraph_index} run {run_index}'
                    )
                    tmp.update({location: run.text})

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import pathlib
import docx
import openpyxl
import pptx
import PyPDF2

# Directory to search
path = r''
# Whether to descend into sub-directories
child = True
# Keyword to look for
keyword = r''
# Skip files larger than this many MB
limit = 10
# Suffixes to skip, e.g. ('.exe', '.zip'). This must be a real tuple:
# ('') is just the string '', and `'' in ''` is True, which would make
# every file without an extension be skipped.
suffix = ()
# Results
result = {}

path = pathlib.Path(path)
# Convert the limit from MB to bytes
limit = limit * 1024 ** 2
# Build the file list
if child:
    files = path.glob('**/*')
else:
    files = path.glob('*')

# 遍历
# result = {file1:{line1:text1,line2:text2...},...}
for file in files:
# 跳过文件夹
if not file.is_file():
continue
# 跳过大文件
if file.stat().st_size > limit:
continue
# 跳过大文件
if file.suffix in suffix:
continue

# 报错捕捉
try:
tmp = {}
match file.suffix:
case '.pdf':
pdf = open(file, "rb")
pdf_reader = PyPDF2.PdfReader(pdf)
for page_num in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_num]
if page.extract_text().find(keyword) != -1:
tmp.update({f'page {page_num+1}':keyword})
case '.xlsx':
wb = openpyxl.load_workbook(file)
for ws in wb:
for row in ws.values:
for value in row:
if value.find(keyword) != -1:
tmp.update({f'row {row}':value})
case '.docx':
doc = docx.Document(file)
for paragraph in doc.paragraphs:
if paragraph.text.find(keyword) != -1:
tmp.update({paragraph:paragraph.text})
for table in doc.tables:
for row in table.rows:
for cell in row.cells:
if cell.text.find(keyword) != -1:
tmp.update({f'cell {cell} row {row}':cell.text})
case '.pptx':
prs = pptx.Presentation(file)
text_runs = []
for slide in prs.slides:
for shape in slide.shapes:
if not shape.has_text_frame:
continue
for paragraph in shape.text_frame.paragraphs:
for run in paragraph.runs:
if run.text.find(keyword) != -1:
tmp.update({run:run.text})
case _:
with open(file, mode='r+', encoding='utf-8') as f:
t = f.readlines()
for line,text in enumerate(t):
# 搜索到
if text.find(keyword) != -1:
tmp.update({line+1:text[:-1]})
except Exception as e:
print(f'ERROR {e} {file}')
else:
result.update({file:tmp}) if len(tmp) else 1

# Print the results: a numbered file header (ANSI-styled), then one line per
# match showing its location (underlined) and the matching text.
for index, (found_file, matches) in enumerate(result.items(), start=1):
    print(f"\n\033[1;44m{index}\033[0m", f"\033[1;31m{found_file}\033[0m")
    for location, matched_text in matches.items():
        print(f" - \033[4m{location}\033[0m: ", matched_text)

One More Thing

注意:只能处理纯文本文件。
Python 跑起来可能比较慢,于是让 ChatGPT 写了个 C 语言版本,在 Windows 和 macOS 上试了能用。
其实本来想用 C++ 的,但是标准库里的文件处理 api 搞不定,最后就只能请教 AI 了😉。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <windows.h>
#else
#include <dirent.h>
#include <sys/stat.h>
#include <unistd.h>
#endif

void traverse_directory(const char *path);

#ifdef _WIN32

/*
 * Recursively print every entry below `path` (Windows implementation).
 *
 * Files are printed as their full path; directories are printed with a
 * trailing '/' and then descended into. Errors are reported on stderr and
 * the affected directory is skipped.
 */
void traverse_directory(const char *path) {
    /* The ANSI ("A") variants are named explicitly because every buffer here
     * is char-based; the generic names switch to wchar_t under UNICODE. */
    WIN32_FIND_DATAA findFileData;
    HANDLE hFind;
    char fullPath[MAX_PATH];
    int written;

    /* Search pattern "<path>\*" */
    written = snprintf(fullPath, sizeof(fullPath), "%s\\*", path);
    if (written < 0 || (size_t)written >= sizeof(fullPath)) {
        fprintf(stderr, "Path too long: %s\n", path);
        return;
    }

    hFind = FindFirstFileA(fullPath, &findFileData);
    if (hFind == INVALID_HANDLE_VALUE) {
        /* Win32 calls report failure via GetLastError(), not errno, so
         * perror() would print an unrelated message here. */
        fprintf(stderr, "FindFirstFile failed for %s (error %lu)\n",
                path, (unsigned long)GetLastError());
        return;
    }

    do {
        /* Skip the self and parent entries to avoid infinite recursion */
        if (strcmp(findFileData.cFileName, ".") != 0 &&
            strcmp(findFileData.cFileName, "..") != 0) {
            written = snprintf(fullPath, sizeof(fullPath), "%s\\%s",
                               path, findFileData.cFileName);
            if (written < 0 || (size_t)written >= sizeof(fullPath)) {
                /* A truncated path would name the wrong entry; skip it */
                fprintf(stderr, "Path too long: %s\\%s\n",
                        path, findFileData.cFileName);
                continue;
            }
            if (findFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
                printf("%s/\n", fullPath);
                traverse_directory(fullPath);
            } else {
                printf("%s\n", fullPath);
            }
        }
    } while (FindNextFileA(hFind, &findFileData) != 0);

    FindClose(hFind);
}

#else

/*
 * Recursively print every entry below `path` (POSIX implementation).
 *
 * Files are printed as their full path; directories are printed with a
 * trailing '/' and then descended into. Full paths are built explicitly
 * instead of calling chdir(), so the process working directory is never
 * changed and a failed chdir() cannot derail the walk.
 */
void traverse_directory(const char *path) {
    struct dirent *entry;
    struct stat statbuf;
    DIR *dp = opendir(path);
    size_t pathLen;
    const char *sep;

    if (dp == NULL) {
        perror("opendir");
        return;
    }

    /* Avoid a doubled separator when the caller passed e.g. "/" or "dir/" */
    pathLen = strlen(path);
    sep = (pathLen > 0 && path[pathLen - 1] == '/') ? "" : "/";

    while ((entry = readdir(dp)) != NULL) {
        char *fullPath;
        size_t fullLen;

        /* Skip the self and parent entries to avoid infinite recursion */
        if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
            continue;

        /* Heap-allocate the joined path so no length limit is imposed */
        fullLen = pathLen + strlen(sep) + strlen(entry->d_name) + 1;
        fullPath = malloc(fullLen);
        if (fullPath == NULL) {
            perror("malloc");
            break;
        }
        snprintf(fullPath, fullLen, "%s%s%s", path, sep, entry->d_name);

        /* lstat does not follow symlinks, so a link to a directory is
         * listed as a plain entry rather than descended into */
        if (lstat(fullPath, &statbuf) != 0) {
            perror(fullPath);
            free(fullPath);
            continue;
        }

        if (S_ISDIR(statbuf.st_mode)) {
            printf("%s/\n", fullPath);
            traverse_directory(fullPath);
        } else {
            printf("%s\n", fullPath);
        }
        free(fullPath);
    }

    closedir(dp);
}

#endif

/*
 * Prompt for a directory path on stdin and list everything below it.
 * Returns 0 on success, 1 when no usable path was entered.
 */
int main(void) {
    char path[1024];
    size_t len;

    printf("Enter directory path: ");
    fflush(stdout);

    /* Read the whole line: scanf("%s") stops at the first space, which
     * breaks paths such as "C:\Program Files". */
    if (fgets(path, sizeof(path), stdin) == NULL) {
        fprintf(stderr, "No path given\n");
        return 1;
    }

    /* Strip the trailing line terminator left in place by fgets */
    len = strcspn(path, "\r\n");
    path[len] = '\0';
    if (len == 0) {
        fprintf(stderr, "No path given\n");
        return 1;
    }

    traverse_directory(path);
    return 0;
}

网站地图 | 状态监测 | 图片加密&解密 | File Server | 博友圈 | 博客说
Copyright 2022-2025 | Powered by Hexo 7.3.0 & Stellar 1.29.1
总访问量次 |