全文搜索（Python）

尽管操作系统的搜索功能已经很完善了，但一般搜索范围仅限文件（夹）名。因此写了个能搜索文件内容的代码。

遍历目录

这里我使用了 Python 自带的 pathlib。

import pathlib
# Directory to search; an empty string resolves to the current directory.
path = pathlib.Path(r'')
# Recursive variant: every entry below the directory, sub-folders included.
files = path.rglob('*')
# Non-recursive variant: only the directory's immediate entries.
# (This assignment replaces the one above; keep whichever variant you need.)
files = path.glob('*')

文本搜索

文本文件

编码默认用 utf-8，也可以改成其他 Python 支持的编码，比如 gbk、gb18030 和 utf-16 等（注意 unicode 本身不是有效的编码名）。

# Search a plain-text file line by line and print every line that contains
# `keyword`, prefixed with its 1-based line number.
# Opened read-only: searching never writes, and 'r+' would needlessly fail
# on files the user cannot modify.
with open(file, mode='r', encoding='utf-8') as f:
    # Iterate the file object directly so lines are streamed rather than
    # loaded into memory all at once.
    for line, text in enumerate(f, start=1):
        if keyword in text:
            # Drop the trailing newline so print() does not emit a blank
            # line after every hit.
            stripped = text.rstrip('\n')
            print(f'{line} {stripped}')

PDF

import PyPDF2
# Search a PDF page by page and record, in `tmp`, the 0-based index of every
# page whose extracted text contains `keyword`.
# The context manager guarantees the file handle is closed even if parsing
# or text extraction raises.
with open(file, "rb") as pdf:
    pdf_reader = PyPDF2.PdfReader(pdf)
    for page_num, page in enumerate(pdf_reader.pages):
        if keyword in page.extract_text():
            tmp.update({page_num:'something'})

Word

需要安装 python-docx 库，只能读取.docx 格式，.doc 不行。

import docx
# Load the .docx document, then scan its body paragraphs and table cells,
# storing every hit in `tmp`.
doc = docx.Document(file)
# Body paragraphs: a matching paragraph is keyed by the paragraph object.
for paragraph in doc.paragraphs:
    if keyword not in paragraph.text:
        continue
    tmp[paragraph] = paragraph.text
# Tables: visit every cell of every row; a matching cell is keyed by the
# string forms of the cell and its row.
for table in doc.tables:
    for row in table.rows:
        for cell in row.cells:
            if keyword in cell.text:
                tmp[f'{cell} {row}'] = cell.text

Excel

需要安装 openpyxl 库，只能读取.xlsx 格式，.xls 不行。

import openpyxl
# Search every cell value of every worksheet in an .xlsx workbook and store
# hits in `tmp`, keyed by the tuple of values of the row containing the hit.
wb = openpyxl.load_workbook(file)
for ws in wb:
    for row in ws.values:
        for value in row:
            # Empty cells are None and numeric/date cells are not strings,
            # so calling .find() on the raw value would raise
            # AttributeError. Skip empties and search the text form.
            if value is None:
                continue
            text = str(value)
            if keyword in text:
                tmp[row] = text

PowerPoint

需要安装 python-pptx 库，只能读取.pptx 格式，.ppt 不行。

import pptx
# Search every text run of every shape on every slide of a .pptx file and
# store hits in `tmp`, keyed by the run object.
prs = pptx.Presentation(file)
for slide in prs.slides:
    for shape in slide.shapes:
        # Shapes without a text frame have nothing to search.
        if not shape.has_text_frame:
            continue
        for paragraph in shape.text_frame.paragraphs:
            for run in paragraph.runs:
                if keyword in run.text:
                    tmp[run] = run.text

代码

import pathlib
import docx
import openpyxl
import pptx
import PyPDF2

# Directory to search
path = r''
# Whether to descend into sub-directories
child = True
# Keyword to look for
keyword = r''
# Skip files larger than this many MiB
limit = 10
# File suffixes to skip, e.g. ('.exe', '.zip').  This must be a real tuple:
# a parenthesised string such as ('') is just a str, which turns the
# membership check into a substring test and skips every file that has no
# extension.
suffix = ()
# Search results: {file: {location: matched text, ...}, ...}
result = {}


def search_text(file, keyword, encoding='utf-8'):
    """Return ``{line_number: line}`` for lines of a text file containing *keyword*.

    Line numbers are 1-based and the trailing newline is stripped.  The file
    is opened read-only so that read-only files can be searched too.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid text in *encoding*.
    """
    hits = {}
    with open(file, mode='r', encoding=encoding) as f:
        for line, text in enumerate(f, start=1):
            if keyword in text:
                # rstrip instead of text[:-1]: the last line of a file may
                # have no newline, and slicing would cut a real character.
                hits[line] = text.rstrip('\n')
    return hits


def search_pdf(file, keyword):
    """Return ``{'page N': keyword}`` for each PDF page whose text contains *keyword*."""
    hits = {}
    # Context manager so the handle is released even if parsing fails.
    with open(file, 'rb') as pdf:
        pdf_reader = PyPDF2.PdfReader(pdf)
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            if keyword in page.extract_text():
                hits[f'page {page_num}'] = keyword
    return hits


def search_xlsx(file, keyword):
    """Return ``{'<sheet> row R col C': text}`` for each matching workbook cell."""
    hits = {}
    wb = openpyxl.load_workbook(file)
    for ws in wb:
        for row_num, row in enumerate(ws.values, start=1):
            for col_num, value in enumerate(row, start=1):
                # Empty cells are None and numeric/date cells are not
                # strings; search the text form instead of calling .find()
                # on the raw value.
                if value is None:
                    continue
                text = str(value)
                if keyword in text:
                    hits[f'{ws.title} row {row_num} col {col_num}'] = text
    return hits


def search_docx(file, keyword):
    """Return matching paragraphs and table cells of a .docx file.

    Keys are readable positions ('paragraph N', 'table T row R cell C')
    rather than library objects, so the printed result is meaningful.
    """
    hits = {}
    doc = docx.Document(file)
    for p_num, paragraph in enumerate(doc.paragraphs, start=1):
        text = paragraph.text
        if keyword in text:
            hits[f'paragraph {p_num}'] = text
    for t_num, table in enumerate(doc.tables, start=1):
        for r_num, row in enumerate(table.rows, start=1):
            for c_num, cell in enumerate(row.cells, start=1):
                text = cell.text
                if keyword in text:
                    hits[f'table {t_num} row {r_num} cell {c_num}'] = text
    return hits


def search_pptx(file, keyword):
    """Return matching text runs of a .pptx file, keyed by their position."""
    hits = {}
    prs = pptx.Presentation(file)
    for s_num, slide in enumerate(prs.slides, start=1):
        for sh_num, shape in enumerate(slide.shapes, start=1):
            # Shapes without a text frame have nothing to search.
            if not shape.has_text_frame:
                continue
            paragraphs = shape.text_frame.paragraphs
            for p_num, paragraph in enumerate(paragraphs, start=1):
                for r_num, run in enumerate(paragraph.runs, start=1):
                    if keyword in run.text:
                        key = (f'slide {s_num} shape {sh_num} '
                               f'paragraph {p_num} run {r_num}')
                        hits[key] = run.text
    return hits


# Suffix -> searcher; anything not listed is treated as UTF-8 text.
SEARCHERS = {
    '.pdf': search_pdf,
    '.xlsx': search_xlsx,
    '.docx': search_docx,
    '.pptx': search_pptx,
}


def search_file(file, keyword):
    """Search one file with the searcher matching its (case-insensitive) suffix.

    Returns a dict mapping a location to the matched text; it is empty when
    nothing matched.  Exceptions from the underlying reader propagate to the
    caller.
    """
    file = pathlib.Path(file)
    searcher = SEARCHERS.get(file.suffix.lower(), search_text)
    return searcher(file, keyword)


def main():
    """Walk `path`, search every eligible file for `keyword`, print the hits."""
    root = pathlib.Path(path)
    max_bytes = limit * 1024 ** 2
    # Accept a single string as well as a tuple/list, and ignore empty
    # entries, so a mis-typed ('') or ('.exe') still behaves as intended.
    raw_suffixes = (suffix,) if isinstance(suffix, str) else suffix
    skipped = {s.lower() for s in raw_suffixes if s}

    # Collect the candidate files
    files = root.glob('**/*') if child else root.glob('*')

    for file in files:
        # Skip directories
        if not file.is_file():
            continue
        # Skip large files
        if file.stat().st_size > max_bytes:
            continue
        # Skip excluded suffixes
        if file.suffix.lower() in skipped:
            continue

        # Top-level boundary: one unreadable or binary file must not abort
        # the whole scan, so report the error and carry on.
        try:
            tmp = search_file(file, keyword)
        except Exception as e:
            print(f'ERROR {e} {file}')
        else:
            if tmp:
                result[file] = tmp

    # Print the results
    for i, (file, hits) in enumerate(result.items(), start=1):
        print(f"\n\033[1;44m{i}\033[0m", f"\033[1;31m{file}\033[0m")
        for location, text in hits.items():
            print(f" - \033[4m{location}\033[0m: ", text)


if __name__ == '__main__':
    main()

One More Thing

注意：下面的 C 语言版本只实现了目录遍历（打印路径），并不搜索文件内容；若要加上内容搜索，也只能处理纯文本文件。
用 Python 跑起来可能比较慢，于是让 ChatGPT 写了一个 C 语言版本，在 Windows 和 macOS 上试了能用。
其实本来想用 C++ 的，但是标准库里的文件处理 api 搞不定，最后就只能请教 AI 了😉。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <windows.h>
#else
#include <dirent.h>
#include <sys/stat.h>
#include <unistd.h>
#endif

void traverse_directory(const char *path);

#ifdef _WIN32

/*
 * Recursively print every entry below `path` (Windows implementation).
 *
 * Directories are printed with a trailing '/', files as-is; both are shown
 * with their full path. The ANSI (...A) API variants are used explicitly
 * because the buffers are `char`; the generic FindFirstFile macro would
 * expand to the wide-character version when UNICODE is defined.
 */
void traverse_directory(const char *path) {
    WIN32_FIND_DATAA findFileData;
    HANDLE hFind;
    char fullPath[MAX_PATH];
    int written;

    /* Build the "<path>\*" search pattern, refusing truncated paths. */
    written = snprintf(fullPath, sizeof(fullPath), "%s\\*", path);
    if (written < 0 || (size_t)written >= sizeof(fullPath)) {
        fprintf(stderr, "Path too long: %s\n", path);
        return;
    }

    hFind = FindFirstFileA(fullPath, &findFileData);
    if (hFind == INVALID_HANDLE_VALUE) {
        /* Win32 calls report failures through GetLastError(), not errno,
         * so perror() would print an unrelated message here. */
        fprintf(stderr, "FindFirstFile failed for %s (error %lu)\n",
                path, (unsigned long)GetLastError());
        return;
    }

    do {
        /* Skip the self and parent pseudo-entries. */
        if (strcmp(findFileData.cFileName, ".") == 0 ||
            strcmp(findFileData.cFileName, "..") == 0) {
            continue;
        }

        written = snprintf(fullPath, sizeof(fullPath), "%s\\%s",
                           path, findFileData.cFileName);
        if (written < 0 || (size_t)written >= sizeof(fullPath)) {
            fprintf(stderr, "Path too long: %s\\%s\n",
                    path, findFileData.cFileName);
            continue;
        }

        if (findFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
            printf("%s/\n", fullPath);
            /* Do not descend into junctions/symlinks: they can point back
             * up the tree and cause endless recursion. */
            if (!(findFileData.dwFileAttributes & FILE_ATTRIBUTE_REPARSE_POINT)) {
                traverse_directory(fullPath);
            }
        } else {
            printf("%s\n", fullPath);
        }
    } while (FindNextFileA(hFind, &findFileData) != 0);

    FindClose(hFind);
}

#else

/*
 * Recursively print every entry below `path` (POSIX implementation).
 *
 * Directories are printed with a trailing '/', files as-is; both are shown
 * with their full path, matching the Windows implementation. Full paths are
 * built for each entry instead of chdir()-ing into each directory, so the
 * process working directory is never changed and a failed chdir() cannot
 * send the walk into the wrong directory.
 * lstat() is used so symbolic links to directories are listed but not
 * followed.
 */
void traverse_directory(const char *path) {
    struct dirent *entry;
    struct stat statbuf;
    DIR *dp = opendir(path);

    if (dp == NULL) {
        perror("opendir");
        return;
    }

    while ((entry = readdir(dp)) != NULL) {
        size_t len;
        char *fullPath;

        /* Skip the self and parent pseudo-entries. */
        if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
            continue;

        /* "<path>/<name>" plus the separator and the terminating NUL. */
        len = strlen(path) + strlen(entry->d_name) + 2;
        fullPath = malloc(len);
        if (fullPath == NULL) {
            perror("malloc");
            break;
        }
        snprintf(fullPath, len, "%s/%s", path, entry->d_name);

        if (lstat(fullPath, &statbuf) != 0) {
            /* Entry vanished or is inaccessible: report it and move on
             * rather than acting on an unset stat buffer. */
            perror(fullPath);
        } else if (S_ISDIR(statbuf.st_mode)) {
            printf("%s/\n", fullPath);
            traverse_directory(fullPath);
        } else {
            printf("%s\n", fullPath);
        }

        free(fullPath);
    }

    closedir(dp);
}

#endif

/*
 * Prompt for a directory path on stdin and print everything below it.
 *
 * Returns 0 on success, 1 if no path could be read.
 */
int main(void) {
    char path[1024];

    printf("Enter directory path: ");
    fflush(stdout);

    /* fgets() reads the whole line, so paths containing spaces work;
     * scanf("%s") would stop at the first whitespace character. */
    if (fgets(path, sizeof(path), stdin) == NULL) {
        fprintf(stderr, "No path given\n");
        return 1;
    }
    /* Strip the trailing line terminator left in the buffer by fgets(). */
    path[strcspn(path, "\r\n")] = '\0';
    if (path[0] == '\0') {
        fprintf(stderr, "No path given\n");
        return 1;
    }

    traverse_directory(path);
    return 0;
}

本文采用 CC BY-NC-SA 4.0 许可协议，转载请注明出处，谢谢！