Use 'poppler' to check if pdfs are encrypted

master
Dan Howe 7 years ago
parent fa1fe9149d
commit 835790a542

@ -34,7 +34,7 @@ then unlock the pdfs and overwrite the original versions.
""" """
__author__ = "D. Howe" __author__ = "D. Howe"
__version__ = "0.1.0" __version__ = "0.2.0"
__email__ = "d.howe@wrl.unsw.edu.au" __email__ = "d.howe@wrl.unsw.edu.au"
import os import os
@ -44,7 +44,58 @@ import argparse
import subprocess import subprocess
from tqdm import tqdm from tqdm import tqdm
def pdfinfo(pdf_name, *args):
    """Get pdf info using the poppler tool 'pdfinfo'.

    Args:
        pdf_name: path to pdf file
        args: optional command line flags, passed to 'pdfinfo' ahead of
            the file name, e.g. pdfinfo(pdf_name, '-isodates', '-v')

    Returns:
        A dictionary with one entry per 'key: value' line of stdout from
        calling 'pdfinfo'. Values are stripped of surrounding whitespace.
        Lines that contain no ':' separator are ignored.
    """
    command = ['pdfinfo', *args, pdf_name]
    result = subprocess.run(command, stdout=subprocess.PIPE)

    pdf_info = {}
    for line in result.stdout.decode('utf-8').split('\n'):
        # Split on the first ':' only, because values (dates, permission
        # lists) may contain further colons.
        key, separator, val = line.partition(':')
        if not separator:
            # Blank line, stray '\r', or free-form output: there is no
            # key/value pair to record, so skip it instead of raising.
            continue
        pdf_info[key] = val.strip()
    return pdf_info
def isencrypted(pdf_name):
    """Report whether 'pdfinfo' flags a pdf as encrypted.

    Args:
        pdf_name: path to pdf file

    Returns:
        True if the 'Encrypted' field reported by 'pdfinfo' starts with
        the word 'yes', otherwise False
    """
    encrypted_field = pdfinfo(pdf_name)['Encrypted']
    # Only the text before the first space is compared; anything after it
    # in the field is ignored.
    flag, _, _ = encrypted_field.partition(' ')
    return flag == 'yes'
def pdf_unlock(pdf_file, overwrite=False): def pdf_unlock(pdf_file, overwrite=False):
"""Rewrite pdf with Ghostscript, removing encryption.
Args:
pdf_file: path to pdf file
overwrite: boolean flag
Returns:
A dictionary containing the lines of stdout from calling 'pdfinfo'
"""
# Create suffix for unlocked pdfs # Create suffix for unlocked pdfs
suffix = '-unlocked' suffix = '-unlocked'
@ -53,7 +104,12 @@ def pdf_unlock(pdf_file, overwrite=False):
output_name = input_name + suffix output_name = input_name + suffix
# Skip file if it has already been processed # Skip file if it has already been processed
if input_name.endswith(suffix): if input_name.endswith(suffix) or os.path.exists(output_name + '.pdf'):
return
# Skip file if it is not encrypted
if not isencrypted(pdf_file):
print('file skipped')
return return
# Get name of Ghostscript executable # Get name of Ghostscript executable

Loading…
Cancel
Save