|
|
@ -43,6 +43,16 @@ def search_string(pattern, string, match_object, context_length):
|
|
|
|
def search_pdf(pdf_name, search_patterns, context_length):
|
|
|
|
def search_pdf(pdf_name, search_patterns, context_length):
|
|
|
|
"""Search for text strings inside a pdf.
|
|
|
|
"""Search for text strings inside a pdf.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Search patterns should be lowercase, unless case-sensitivity is important.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'ph' will match:
|
|
|
|
|
|
|
|
- photograph
|
|
|
|
|
|
|
|
- PHANTOM
|
|
|
|
|
|
|
|
- pH
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'pH' will match:
|
|
|
|
|
|
|
|
- pH
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
Args:
|
|
|
|
pdf_name: path to pdf file
|
|
|
|
pdf_name: path to pdf file
|
|
|
|
search_patterns: list of re search patterns
|
|
|
|
search_patterns: list of re search patterns
|
|
|
@ -80,7 +90,7 @@ def search_pdf(pdf_name, search_patterns, context_length):
|
|
|
|
matches = re.finditer(pattern, page_text.lower())
|
|
|
|
matches = re.finditer(pattern, page_text.lower())
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
matches = re.finditer(pattern, page_text)
|
|
|
|
matches = re.finditer(pattern, page_text)
|
|
|
|
|
|
|
|
|
|
|
|
for match in matches:
|
|
|
|
for match in matches:
|
|
|
|
whole_word, context = search_string(
|
|
|
|
whole_word, context = search_string(
|
|
|
|
pattern, page_text, match, context_length)
|
|
|
|
pattern, page_text, match, context_length)
|
|
|
|