From ff6c503684407ee51330677d6ad921f9f8cb98c3 Mon Sep 17 00:00:00 2001 From: Dan Howe Date: Tue, 17 Apr 2018 13:22:01 +1000 Subject: [PATCH] Improve handling of unicode control characters. --- pdfsearch/pdfsearch.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/pdfsearch/pdfsearch.py b/pdfsearch/pdfsearch.py index 8850733..3ec954b 100644 --- a/pdfsearch/pdfsearch.py +++ b/pdfsearch/pdfsearch.py @@ -4,7 +4,26 @@ import subprocess from tqdm import tqdm -def parse_cli_args(kwargs): +def parse_cli_args(**kwargs): + """Prepare arguments for command line. + + Args: + kwargs: dictionary containing option names and values + + Returns: + List of arguments to pass to subprocess + + Example usage: + > parse_cli_args(o='output.txt') + ['-o', 'output.txt'] + + > parse_cli_args(f=1, l=2) + ['-f', '1', '-l', '2'] + + > parse_cli_args(h=True) + ['-h'] + """ + args = [] for opt, val in kwargs.items(): # Get option name @@ -27,7 +46,7 @@ def pdfinfo(pdf_file, **kwargs): """ # Parse command line arguments - args = parse_cli_args(kwargs) + args = parse_cli_args(**kwargs) # Collect command line arguments command_str = ['pdfinfo', '-isodates', *args, pdf_file] @@ -63,7 +82,7 @@ def pdftotext(pdf_file, **kwargs): """ # Parse command line arguments - args = parse_cli_args(kwargs) + args = parse_cli_args(**kwargs) command_str = ['pdftotext', *args, pdf_file, '-'] @@ -71,7 +90,7 @@ def pdftotext(pdf_file, **kwargs): command_str, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - bufsize=1, ) + bufsize=1) stdout = result.stdout.read().decode() stderr = result.stderr.read().decode() @@ -155,8 +174,8 @@ def search_pdf(pdf_name, search_patterns, context_length): page_text = pdftotext(pdf_name, f=page_num, l=page_num) # Remove unicode control characters - page_text = re.sub('[\x02\x03]', ' ', page_text) - page_text = re.sub('\r', '', page_text) + page_text = re.sub('\r\n', '\n', page_text) + page_text = re.sub('[\x02\x03\n]', ' ', page_text) for pattern in search_patterns: if pattern == pattern.lower():