You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

175 lines
4.4 KiB
Python

"""Remove security features from pdfs.
This script removes security features of pdfs inside a specified folder,
including restrictions on:
- copying text and images
- printing the document
usage: pdf_unlock.py [-h] [-r] [-o] folder
positional arguments:
folder name of input folder
optional arguments:
-h, --help show this help message and exit
-r, --recursive search for files recursively
-o, --overwrite overwrite original files
Examples:
Search for pdfs inside 'pdf_folder', then create new unlocked versions
of the pdfs with the suffix '-unlocked.pdf'.
> python pdf_unlock.py pdf_folder
Search for pdfs inside 'pdf_folder', then unlock the pdfs and overwrite
the original versions.
> python pdf_unlock.py pdf_folder -o
Search recursively for pdfs inside 'pdf_folder' and all subfolders,
then unlock the pdfs and overwrite the original versions.
> python pdf_unlock.py pdf_folder -o -r
"""
__author__ = "D. Howe"
__version__ = "0.2.0"
__email__ = "d.howe@wrl.unsw.edu.au"
import argparse
import glob
import os
import shutil
import subprocess
import sys

from tqdm import tqdm
def pdfinfo(pdf_name, *args):
    """Get pdf info using the poppler tool 'pdfinfo'.

    Args:
        pdf_name: path to pdf file
        args: optional extra command line arguments passed to 'pdfinfo'
            before the file name, e.g. '-isodates', '-v'

    Returns:
        A dictionary built from the 'key: value' lines of the stdout of
        'pdfinfo'. Each line is split at its first ':'; the value has
        surrounding whitespace removed. Lines without a ':' are ignored.
    """
    command = ['pdfinfo', *args, pdf_name]
    result = subprocess.run(command, stdout=subprocess.PIPE)

    # Replace undecodable bytes instead of raising, so one file with
    # unusual metadata does not abort the caller.
    text = result.stdout.decode('utf-8', errors='replace')

    pdf_info = {}
    for line in text.split('\n'):
        # Split at the first colon only: values may contain colons too
        # (e.g. 'yes (print:no copy:no)').
        key, separator, val = line.partition(':')
        if not separator:
            # Blank line or a line that is not a 'key: value' pair.
            continue
        pdf_info[key] = val.strip()
    return pdf_info
def isencrypted(pdf_name):
    """Report whether 'pdfinfo' flags a pdf as encrypted.

    Args:
        pdf_name: path to pdf file

    Returns:
        True if the first space-delimited word of the 'Encrypted' entry
        reported by pdfinfo() is 'yes', otherwise False
    """
    encrypted_field = pdfinfo(pdf_name)['Encrypted']
    # Only the leading word is compared; anything after the first space
    # is ignored.
    flag = encrypted_field.partition(' ')[0]
    return flag == 'yes'
def pdf_unlock(pdf_file, overwrite=False):
    """Rewrite pdf with Ghostscript, removing encryption.

    The unlocked copy is written next to the original, with '-unlocked'
    appended to the file name (before the '.pdf' extension). The file is
    skipped when its name already ends with that suffix, when the unlocked
    copy already exists, or when isencrypted() reports it as not encrypted.

    Args:
        pdf_file: path to pdf file
        overwrite: if True, replace the original file with the unlocked
            copy once Ghostscript has finished successfully

    Returns:
        None
    """
    # Suffix for unlocked pdfs
    suffix = '-unlocked'

    # Strip file extension
    input_name = os.path.splitext(pdf_file)[0]
    output_file = input_name + suffix + '.pdf'

    # Skip file if it has already been processed
    if input_name.endswith(suffix) or os.path.exists(output_file):
        return

    # Skip file if it is not encrypted
    if not isencrypted(pdf_file):
        print('file skipped')
        return

    # Get name of Ghostscript executable
    exec_name = 'gswin64c' if sys.platform == 'win32' else 'gs'

    # Build the command as an argument list. A single command string
    # without shell=True is treated as the program name on POSIX, and a
    # list also avoids any quoting problems with unusual file names.
    command = [
        exec_name,
        '-dSAFER',
        '-dNOPAUSE',
        '-dQUIET',
        '-dBATCH',
        '-sPDFPassword=',
        '-sDEVICE=pdfwrite',
        '-sOutputFile=' + output_file,
        pdf_file,
    ]

    # Echo Ghostscript's stdout line by line while it runs
    with subprocess.Popen(
            command, stdout=subprocess.PIPE, bufsize=1,
            universal_newlines=True) as p:
        for line in p.stdout:
            print(line, end='')

    # Leaving the 'with' block waits for the process, so returncode is set.
    if p.returncode != 0:
        print('Ghostscript failed with exit code {} for {}'.format(
            p.returncode, pdf_file))
        # Remove any partial output so it is never mistaken for a finished
        # unlocked copy, and never replaces the original.
        if os.path.exists(output_file):
            os.remove(output_file)
        return

    # Overwrite original file
    if overwrite:
        shutil.move(output_file, pdf_file)
def main(argv=None):
    """Command line entry point: unlock the pdfs found in a folder.

    Args:
        argv: optional list of command line arguments. When None (the
            default), argparse reads them from sys.argv[1:].
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('folder', help='name of input folder')
    parser.add_argument(
        '-r',
        '--recursive',
        help='search for files recursively',
        action='store_true')
    parser.add_argument(
        '-o',
        '--overwrite',
        help='overwrite original files',
        action='store_true')
    args = parser.parse_args(argv)

    # Escape glob metacharacters in the folder name so that folders such
    # as 'scans [2019]' are searched literally.
    folder = glob.escape(args.folder)

    # Get pdf files
    if args.recursive:
        glob_str = os.path.join(folder, '**', '*.pdf')
    else:
        glob_str = os.path.join(folder, '*.pdf')
    pdf_files = glob.glob(glob_str, recursive=args.recursive)

    pbar = tqdm(pdf_files)
    for pdf_file in pbar:
        # Show only the file name (not the full path) in the progress bar
        pdf_file_shortname = os.path.basename(pdf_file)
        pbar.set_description('Processing {}'.format(pdf_file_shortname))
        pdf_unlock(pdf_file, overwrite=args.overwrite)
# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()