From ff6c503684407ee51330677d6ad921f9f8cb98c3 Mon Sep 17 00:00:00 2001
From: Dan Howe <d.howe@wrl.unsw.edu.au>
Date: Tue, 17 Apr 2018 13:22:01 +1000
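Subject: [PATCH] Improve handling of unicode control characters.

Normalise CRLF line endings to LF, then replace newlines together with
the \x02 and \x03 control characters by spaces before searching a page.

Also change parse_cli_args() to accept keyword arguments (**kwargs)
instead of a dictionary, document it, and update its callers in
pdfinfo() and pdftotext().
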
---
 pdfsearch/pdfsearch.py | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/pdfsearch/pdfsearch.py b/pdfsearch/pdfsearch.py
index 8850733..3ec954b 100644
--- a/pdfsearch/pdfsearch.py
+++ b/pdfsearch/pdfsearch.py
@@ -4,7 +4,26 @@ import subprocess
 from tqdm import tqdm
 
 
-def parse_cli_args(kwargs):
+def parse_cli_args(**kwargs):
+    """Prepare arguments for command line.
+
+    Args:
+        **kwargs: option names and their values, given as keyword arguments
+
+    Returns:
+        List of arguments to pass to subprocess
+
+    Example usage:
+      > parse_cli_args(o='output.txt')
+      ['-o', 'output.txt']
+
+      > parse_cli_args(f=1, l=2)
+      ['-f', '1', '-l', '2']
+
+      > parse_cli_args(h=True)
+      ['-h']
+    """
+
     args = []
     for opt, val in kwargs.items():
         # Get option name
@@ -27,7 +46,7 @@ def pdfinfo(pdf_file, **kwargs):
     """
 
     # Parse command line arguments
-    args = parse_cli_args(kwargs)
+    args = parse_cli_args(**kwargs)
 
     # Collect command line arguments
     command_str = ['pdfinfo', '-isodates', *args, pdf_file]
@@ -63,7 +82,7 @@ def pdftotext(pdf_file, **kwargs):
     """
 
     # Parse command line arguments
-    args = parse_cli_args(kwargs)
+    args = parse_cli_args(**kwargs)
 
     command_str = ['pdftotext', *args, pdf_file, '-']
 
@@ -71,7 +90,7 @@ def pdftotext(pdf_file, **kwargs):
         command_str,
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
-        bufsize=1, )
+        bufsize=1)
     stdout = result.stdout.read().decode()
     stderr = result.stderr.read().decode()
 
@@ -155,8 +174,8 @@ def search_pdf(pdf_name, search_patterns, context_length):
         page_text = pdftotext(pdf_name, f=page_num, l=page_num)
 
         # Remove unicode control characters
-        page_text = re.sub('[\x02\x03]', ' ', page_text)
-        page_text = re.sub('\r', '', page_text)
+        page_text = re.sub('\r\n', '\n', page_text)
+        page_text = re.sub('[\x02\x03\n]', ' ', page_text)
 
         for pattern in search_patterns:
             if pattern == pattern.lower():