You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

98 lines
3.2 KiB
Python

import os
import re
import PyPDF2
from PyPDF2.utils import PdfReadError
from zlib import error as ZLibError
from tqdm import tqdm
def search_string(pattern, string, match_object, context_length):
    """Get the whole word and surrounding words for a regex match.

    Words are delimited by single space characters (' ').

    Args:
        pattern: re search pattern that produced the match. Kept for
            interface compatibility; the returned word is built from the
            text that actually matched, not from the pattern source.
        string: the string the match offsets refer to
        match_object: re match object whose start()/end() index into string
        context_length: number of words to include before and after

    Returns:
        whole_word: the matched text together with any non-space characters
            directly attached before and after it
        context: whole_word joined by spaces with up to context_length
            words on either side
    """
    start, end = match_object.span()

    # Space-delimited pieces on either side of the match. The last piece
    # before and the first piece after are the fragments of the matched
    # word itself (empty when the match sits on a word boundary).
    pieces_before = string[:start].split(' ')
    pieces_after = string[end:].split(' ')

    # Use the text as it appears in the string rather than the pattern, so
    # regex syntax never leaks into the result and the original casing is
    # kept for case-insensitive searches.
    matched_text = string[start:end]
    whole_word = pieces_before[-1] + matched_text + pieces_after[0]

    # Neighbouring words, excluding the fragments already part of whole_word
    words_before = pieces_before[-1 - context_length:-1]
    words_after = pieces_after[1:context_length + 1]
    context = ' '.join(words_before + [whole_word] + words_after)
    return whole_word, context
def search_pdf(pdf_name, search_patterns, context_length):
    """Search for text strings inside a pdf.

    Args:
        pdf_name: path to pdf file
        search_patterns: a single re search pattern, or a list of them.
            A pattern that is entirely lowercase is matched
            case-insensitively; any other pattern is matched
            case-sensitively.
        context_length: number of words to include before and after

    Returns:
        A list of dictionaries, one per match, with the keys 'document',
        'page' (1-based page number), 'pattern', 'word' and 'context'.
    """
    # Convert search patterns to list, if a single search string is provided
    if isinstance(search_patterns, str):
        search_patterns = [search_patterns]

    # Decide the matching mode once per pattern instead of once per page.
    # Case-insensitive matching uses re.IGNORECASE on the original text
    # rather than matching against a lowercased copy: lowercasing can change
    # the length of the text, which would make the match offsets point at
    # the wrong place in the original page text.
    pattern_flags = [
        (pattern, re.IGNORECASE if pattern == pattern.lower() else 0)
        for pattern in search_patterns
    ]

    search_results = []
    with open(pdf_name, 'rb') as pdf:
        pdf_shortname = os.path.split(pdf_name)[1]
        reader = PyPDF2.PdfFileReader(pdf)
        num_pages = reader.getNumPages()

        # Set up tqdm progress bar; the label is the same for every page
        pbar = tqdm(range(num_pages))
        pbar.set_description('Searching {}'.format(pdf_shortname))

        for i in pbar:
            page_num = i + 1
            page = reader.getPage(i)
            try:
                page_text = page.extractText().replace('\n', '')
            # Skip page if text cannot be read
            except (KeyError, ZLibError):
                continue

            for pattern, flags in pattern_flags:
                for match in re.finditer(pattern, page_text, flags):
                    whole_word, context = search_string(
                        pattern, page_text, match, context_length)
                    # Update search results
                    search_results.append({
                        'document': pdf_name,
                        'page': page_num,
                        'pattern': pattern,
                        'word': whole_word,
                        'context': context,
                    })
    return search_results