Improve handling of unicode control characters.

8 years ago · ff6c503684
parent ec7ebdf1b6
commit ff6c503684
1 changed files with 25 additions and 6 deletions
--- a/pdfsearch/pdfsearch.py
+++ b/pdfsearch/pdfsearch.py
@@ -4,7 +4,26 @@ import subprocess
 from tqdm import tqdm
-def parse_cli_args(kwargs):
+def parse_cli_args(**kwargs):
    """Prepare arguments for command line.
    Args:
        kwargs: dictionary containing option names and values
    Returns:
        List of arguments to pass to subprocess
    Example usage:
      > parse_cli_args(o='output.txt')
      ['-o', 'output.txt']
      > parse_cli_args(f=1, l=2)
      ['-f', '1', '-l', '2']
      > parse_cli_args(h=True)
      ['-h']
    """
    args = []
    for opt, val in kwargs.items():
        # Get option name
@@ -27,7 +46,7 @@ def pdfinfo(pdf_file, **kwargs):
    """
    # Parse command line arguments
-    args = parse_cli_args(kwargs)
+    args = parse_cli_args(**kwargs)
    # Collect command line arguments
    command_str = ['pdfinfo', '-isodates', *args, pdf_file]
@@ -63,7 +82,7 @@ def pdftotext(pdf_file, **kwargs):
    """
    # Parse command line arguments
-    args = parse_cli_args(kwargs)
+    args = parse_cli_args(**kwargs)
    command_str = ['pdftotext', *args, pdf_file, '-']
@@ -71,7 +90,7 @@ def pdftotext(pdf_file, **kwargs):
        command_str,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
-        bufsize=1, )
+        bufsize=1)
    stdout = result.stdout.read().decode()
    stderr = result.stderr.read().decode()
@@ -155,8 +174,8 @@ def search_pdf(pdf_name, search_patterns, context_length):
        page_text = pdftotext(pdf_name, f=page_num, l=page_num)
        # Remove unicode control characters
-        page_text = re.sub('[\x02\x03]', ' ', page_text)
+        page_text = re.sub('\r\n', '\n', page_text)
-        page_text = re.sub('\r', '', page_text)
+        page_text = re.sub('[\x02\x03\n]', ' ', page_text)
        for pattern in search_patterns:
            if pattern == pattern.lower():