Improve handling of unicode control characters.

master
Dan Howe 7 years ago
parent ec7ebdf1b6
commit ff6c503684

@ -4,7 +4,26 @@ import subprocess
from tqdm import tqdm from tqdm import tqdm
def parse_cli_args(kwargs): def parse_cli_args(**kwargs):
"""Prepare arguments for command line.
Args:
**kwargs: option names and values, passed as keyword arguments
Returns:
List of arguments to pass to subprocess
Example usage:
> parse_cli_args(o='output.txt')
['-o', 'output.txt']
> parse_cli_args(f=1, l=2)
['-f', '1', '-l', '2']
> parse_cli_args(h=True)
['-h']
"""
args = [] args = []
for opt, val in kwargs.items(): for opt, val in kwargs.items():
# Get option name # Get option name
@ -27,7 +46,7 @@ def pdfinfo(pdf_file, **kwargs):
""" """
# Parse command line arguments # Parse command line arguments
args = parse_cli_args(kwargs) args = parse_cli_args(**kwargs)
# Collect command line arguments # Collect command line arguments
command_str = ['pdfinfo', '-isodates', *args, pdf_file] command_str = ['pdfinfo', '-isodates', *args, pdf_file]
@ -63,7 +82,7 @@ def pdftotext(pdf_file, **kwargs):
""" """
# Parse command line arguments # Parse command line arguments
args = parse_cli_args(kwargs) args = parse_cli_args(**kwargs)
command_str = ['pdftotext', *args, pdf_file, '-'] command_str = ['pdftotext', *args, pdf_file, '-']
@ -71,7 +90,7 @@ def pdftotext(pdf_file, **kwargs):
command_str, command_str,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
bufsize=1, ) bufsize=1)
stdout = result.stdout.read().decode() stdout = result.stdout.read().decode()
stderr = result.stderr.read().decode() stderr = result.stderr.read().decode()
@ -155,8 +174,8 @@ def search_pdf(pdf_name, search_patterns, context_length):
page_text = pdftotext(pdf_name, f=page_num, l=page_num) page_text = pdftotext(pdf_name, f=page_num, l=page_num)
# Remove unicode control characters # Remove unicode control characters
page_text = re.sub('[\x02\x03]', ' ', page_text) page_text = re.sub('\r\n', '\n', page_text)
page_text = re.sub('\r', '', page_text) page_text = re.sub('[\x02\x03\n]', ' ', page_text)
for pattern in search_patterns: for pattern in search_patterns:
if pattern == pattern.lower(): if pattern == pattern.lower():

Loading…
Cancel
Save