|
|
|
"""Remove security features from pdfs.

This script removes security features of pdfs inside a specified folder,
including restrictions on:

    - copying text and images
    - printing the document

usage: pdf_unlock.py [-h] [-r] [-o] folder

positional arguments:
  folder           name of input folder

optional arguments:
  -h, --help       show this help message and exit
  -r, --recursive  search for files recursively
  -o, --overwrite  overwrite original files

Examples:

Search for pdfs inside 'pdf_folder', then create new unlocked versions
of the pdfs with the suffix '-unlocked.pdf'.
    > python pdf_unlock.py pdf_folder

Search for pdfs inside 'pdf_folder', then unlock the pdfs and overwrite
the original versions.
    > python pdf_unlock.py pdf_folder -o

Search recursively for pdfs inside 'pdf_folder' and all subfolders,
then unlock the pdfs and overwrite the original versions.
    > python pdf_unlock.py pdf_folder -o -r

"""
|
|
|
|
|
|
|
|
# Module metadata: author, script version and contact address.
__author__ = "D. Howe"
__version__ = "0.2.0"
__email__ = "d.howe@wrl.unsw.edu.au"
|
|
|
|
|
|
|
|
import argparse
import glob
import os
import shutil
import subprocess
import sys

from tqdm import tqdm
|
|
|
|
|
|
|
|
|
|
|
|
def pdfinfo(pdf_name, *args):
    """Get pdf info using the poppler tool 'pdfinfo'.

    Args:
        pdf_name: path to pdf file
        *args: optional command line switches that are passed through to
            'pdfinfo' ahead of the file name, e.g. '-isodates', '-v'

    Returns:
        A dictionary built from the lines of stdout from calling 'pdfinfo'.
        Each line is split at its first colon: the text before the colon
        becomes the key and the stripped text after it becomes the value.
        Lines that contain no colon (including blank lines) are ignored.
    """
    command = ['pdfinfo', *args, pdf_name]
    result = subprocess.run(command, stdout=subprocess.PIPE)

    # Undecodable bytes are replaced instead of raising, so one odd
    # metadata value does not make every other field unavailable.
    text = result.stdout.decode('utf-8', errors='replace')

    pdf_info = {}
    # splitlines() also copes with '\r\n' line endings, which would otherwise
    # leave a bare '\r' "line" behind.
    for line in text.splitlines():
        key, separator, val = line.partition(':')
        if not separator:
            # Not a 'key: value' line; nothing to record.
            continue
        pdf_info[key] = val.strip()

    return pdf_info
|
|
|
|
|
|
|
|
|
|
|
|
def isencrypted(pdf_name):
    """Check if a pdf is encrypted.

    The decision is based on the first word of the 'Encrypted' field
    reported by pdfinfo().

    Args:
        pdf_name: path to pdf file

    Returns:
        True if encrypted, otherwise False

    Raises:
        AttributeError if pdf info cannot be obtained, i.e. the 'Encrypted'
        field is missing or its first word is neither 'yes' nor 'no'
    """
    try:
        # Only the first word matters; anything after it on the line is
        # ignored.
        status = pdfinfo(pdf_name)['Encrypted'].split(' ')[0]
    except (KeyError, ValueError) as err:
        # Keep the original failure attached as the cause for debugging.
        raise AttributeError('Could not read pdf info.') from err

    if status == 'yes':
        return True
    if status == 'no':
        return False

    raise AttributeError('Could not read pdf info.')
|
|
|
|
|
|
|
|
|
|
|
|
def pdf_unlock(pdf_file, overwrite=False):
    """Rewrite pdf with Ghostscript, removing encryption.

    The unlocked copy is written next to the input file, using the input
    name with the suffix '-unlocked' before the '.pdf' extension. A file is
    skipped when its name already ends in '-unlocked', when an unlocked copy
    already exists, or when it is not encrypted.

    Args:
        pdf_file: path to pdf file
        overwrite: boolean flag; if True the unlocked copy replaces the
            original file instead of being kept alongside it

    Returns:
        None

    Raises:
        AttributeError if the encryption status of the pdf cannot be
        determined (raised by isencrypted())
    """
    # Create suffix for unlocked pdfs
    suffix = '-unlocked'

    # Strip file extension
    input_name = os.path.splitext(pdf_file)[0]
    output_file = input_name + suffix + '.pdf'

    # Skip file if it has already been processed
    if input_name.endswith(suffix) or os.path.exists(output_file):
        return

    # Skip file if it is not encrypted
    if not isencrypted(pdf_file):
        return

    # Get name of Ghostscript executable
    exec_name = 'gswin64c' if sys.platform == 'win32' else 'gs'

    # Build the command as an argument list: without a shell, a single
    # command string is treated as the program name on POSIX, and a list
    # also needs no quoting for paths containing spaces or quotes.
    command = [
        exec_name,
        '-dSAFER',
        '-dNOPAUSE',
        '-dQUIET',
        '-dBATCH',
        '-sPDFPassword=',
        '-sDEVICE=pdfwrite',
        '-sOutputFile=' + output_file,
        # Pass the path exactly as given rather than rebuilding it from the
        # stripped name, so extensions such as '.PDF' keep working.
        pdf_file,
    ]

    with subprocess.Popen(
            command, stdout=subprocess.PIPE, bufsize=1,
            universal_newlines=True) as p:
        for line in p.stdout:
            print(line, end='')

    # Leaving the 'with' block waits for Ghostscript, so returncode is set.
    if p.returncode != 0:
        # Remove any partial output so the original is never replaced by a
        # broken file and a later run does not regard this pdf as done.
        if os.path.exists(output_file):
            os.remove(output_file)
        print('Ghostscript failed with exit code {} for {}'.format(
            p.returncode, pdf_file), file=sys.stderr)
        return

    # Overwrite original file
    if overwrite:
        shutil.move(output_file, pdf_file)
|
|
|
|
|
|
|
|
|
|
|
|
def main(argv=None):
    """Parse the command line and unlock every pdf in the given folder.

    Args:
        argv: optional list of command line arguments; when None, argparse
            reads them from sys.argv

    Exits with a usage error if the folder does not exist. A pdf whose info
    cannot be read is reported and skipped so that the remaining files are
    still processed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('folder', help='name of input folder')
    parser.add_argument(
        '-r',
        '--recursive',
        help='search for files recursively',
        action='store_true')
    parser.add_argument(
        '-o',
        '--overwrite',
        help='overwrite original files',
        action='store_true')
    args = parser.parse_args(argv)

    if not os.path.isdir(args.folder):
        parser.error('folder not found: {}'.format(args.folder))

    # Get pdf files. The folder name is escaped so that characters such as
    # '[' or '*' in it are matched literally instead of acting as wildcards.
    folder = glob.escape(args.folder)
    if args.recursive:
        glob_str = os.path.join(folder, '**', '*.pdf')
    else:
        glob_str = os.path.join(folder, '*.pdf')

    pdf_files = glob.glob(glob_str, recursive=args.recursive)

    pbar = tqdm(pdf_files)
    for pdf_file in pbar:
        pdf_file_shortname = os.path.split(pdf_file)[1]
        pbar.set_description('Processing {}'.format(pdf_file_shortname))
        try:
            pdf_unlock(pdf_file, overwrite=args.overwrite)
        except AttributeError as err:
            # Raised when the pdf info cannot be read; report the file via
            # tqdm so the progress bar is not garbled, then carry on.
            tqdm.write('Skipping {}: {}'.format(pdf_file, err))
|
|
|
|
|
|
|
|
|
|
|
|
# Run the command line interface only when executed as a script,
# not when the module is imported.
if __name__ == '__main__':
    main()
|