diff --git a/pdfsearch/pdfsearch.py b/pdfsearch/pdfsearch.py index 1c7d403..6ae045a 100644 --- a/pdfsearch/pdfsearch.py +++ b/pdfsearch/pdfsearch.py @@ -43,6 +43,16 @@ def search_string(pattern, string, match_object, context_length): def search_pdf(pdf_name, search_patterns, context_length): """Search for text strings inside a pdf. + Search patterns should be lowercase, unless case-sensitivity is important. + + 'ph' will match: + - photograph + - PHANTOM + - pH + + 'pH' will match: + - pH + Args: pdf_name: path to pdf file search_patterns: list of re search patterns @@ -80,7 +90,7 @@ def search_pdf(pdf_name, search_patterns, context_length): matches = re.finditer(pattern, page_text.lower()) else: matches = re.finditer(pattern, page_text) - + for match in matches: whole_word, context = search_string( pattern, page_text, match, context_length)