|
|
|
@ -4,7 +4,26 @@ import subprocess
|
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_cli_args(kwargs):
|
|
|
|
|
def parse_cli_args(**kwargs):
|
|
|
|
|
"""Prepare arguments for command line.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
**kwargs: option names and their values, passed as keyword arguments
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
List of arguments to pass to subprocess
|
|
|
|
|
|
|
|
|
|
Example usage:
|
|
|
|
|
> parse_cli_args(o='output.txt')
|
|
|
|
|
['-o', 'output.txt']
|
|
|
|
|
|
|
|
|
|
> parse_cli_args(f=1, l=2)
|
|
|
|
|
['-f', '1', '-l', '2']
|
|
|
|
|
|
|
|
|
|
> parse_cli_args(h=True)
|
|
|
|
|
['-h']
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
args = []
|
|
|
|
|
for opt, val in kwargs.items():
|
|
|
|
|
# Get option name
|
|
|
|
@ -27,7 +46,7 @@ def pdfinfo(pdf_file, **kwargs):
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# Parse command line arguments
|
|
|
|
|
args = parse_cli_args(kwargs)
|
|
|
|
|
args = parse_cli_args(**kwargs)
|
|
|
|
|
|
|
|
|
|
# Collect command line arguments
|
|
|
|
|
command_str = ['pdfinfo', '-isodates', *args, pdf_file]
|
|
|
|
@ -63,7 +82,7 @@ def pdftotext(pdf_file, **kwargs):
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# Parse command line arguments
|
|
|
|
|
args = parse_cli_args(kwargs)
|
|
|
|
|
args = parse_cli_args(**kwargs)
|
|
|
|
|
|
|
|
|
|
command_str = ['pdftotext', *args, pdf_file, '-']
|
|
|
|
|
|
|
|
|
@ -71,7 +90,7 @@ def pdftotext(pdf_file, **kwargs):
|
|
|
|
|
command_str,
|
|
|
|
|
stdout=subprocess.PIPE,
|
|
|
|
|
stderr=subprocess.PIPE,
|
|
|
|
|
bufsize=1, )
|
|
|
|
|
bufsize=1)
|
|
|
|
|
stdout = result.stdout.read().decode()
|
|
|
|
|
stderr = result.stderr.read().decode()
|
|
|
|
|
|
|
|
|
@ -155,8 +174,8 @@ def search_pdf(pdf_name, search_patterns, context_length):
|
|
|
|
|
page_text = pdftotext(pdf_name, f=page_num, l=page_num)
|
|
|
|
|
|
|
|
|
|
# Remove unicode control characters
|
|
|
|
|
page_text = re.sub('[\x02\x03]', ' ', page_text)
|
|
|
|
|
page_text = re.sub('\r', '', page_text)
|
|
|
|
|
page_text = re.sub('\r\n', '\n', page_text)
|
|
|
|
|
page_text = re.sub('[\x02\x03\n]', ' ', page_text)
|
|
|
|
|
|
|
|
|
|
for pattern in search_patterns:
|
|
|
|
|
if pattern == pattern.lower():
|
|
|
|
|