"""major_projects_grabber.py
Download documents from the NSW DPE Major Projects website.

Example usage:
    # Grab a single project modification using its job id, and save in 'files'
    python major_projects_grabber.py -o files -i 1019

    # Grab all modifications in a search results page, and save in current folder
    python major_projects_grabber.py -o . -u SEARCH_RESULTS_URL

"""

# search url
# http://majorprojects.planning.nsw.gov.au/index.pl?action=search&page_id=&search=&authority_id=&search_site_type_id=10&reference_table=&status_id=&decider=&from_date=&to_date=&x=31&y=16

# mod url
# http://majorprojects.planning.nsw.gov.au/index.pl?action=view_job&job_id=9503

import os
import re
import sys
import shutil
import logging
import argparse

import requests
from lxml import html
from requests.exceptions import ConnectionError, InvalidURL  # noqa: F401 (kept for callers)

# NOTE(review): `import pandas as pd` was removed — pandas is not used anywhere
# in this module; consider also dropping it from setup.py's install_requires.

# Timeout (seconds) for every HTTP request so a stalled server cannot hang the run.
REQUEST_TIMEOUT = 30


def _log_failure(document_name, project_name, mod_name, folder_name):
    """Log a single download failure with its full page context."""
    logging.error('Failed to download %s\n'
                  '    Project: %s\n'
                  '    Modification: %s\n'
                  '    Folder: %s\n',
                  document_name, project_name, mod_name, folder_name)


def get_documents(mod_id, output_dir):
    """Download all documents from project modification ID.

    Args:
        mod_id: modification job id (string or int) as used by the website.
        output_dir: root output directory; each file is saved under
            <output_dir>/<project>/<modification>/<folder>/<document>.

    Individual document failures are logged to the error log and skipped,
    so one bad link does not abort the whole run. Documents that already
    exist on disk are not re-downloaded.
    """
    # Get html from mod page
    mod_url = ('http://majorprojects.planning.nsw.gov.au/'
               'index.pl?action=view_job&job_id=' + str(mod_id))
    mod_page = requests.get(mod_url, timeout=REQUEST_TIMEOUT)
    mod_page.raise_for_status()  # fail loudly on a bad job id / server error
    mod_tree = html.fromstring(mod_page.content)

    # Project and modification names come from the page header block
    project_name = mod_tree.xpath(
        '//*[@class="vpa_header vpa_header_contrast"]/h2')[0].text
    mod_name = mod_tree.xpath(
        '//*[@class="vpa_header vpa_header_contrast"]/h1')[0].text

    # Walk each document folder on the page
    for folder in mod_tree.xpath('//div[@class="folder_row"]'):
        folder_name = folder.xpath('a[2]')[0].text.strip()

        # Get documents in current folder
        for document in folder.xpath('ul/li/a'):
            document_url = document.get('href')
            # document.text can be None for image-only links; never join None
            document_name = (document.text or '').strip()
            if not document_name:
                _log_failure(document_url, project_name, mod_name, folder_name)
                continue
            document_path = os.path.join(output_dir, project_name, mod_name,
                                         folder_name, document_name)

            # Create output directories as required
            try:
                os.makedirs(os.path.dirname(document_path), exist_ok=True)
            except OSError:
                _log_failure(document_name, project_name, mod_name,
                             folder_name)
                continue

            # Skip documents that have already been downloaded
            if os.path.isfile(document_path):
                continue

            try:
                r = requests.get(document_url, stream=True,
                                 timeout=REQUEST_TIMEOUT)
                # Don't save 404/500 error pages as documents
                r.raise_for_status()
            except requests.RequestException:
                # Covers ConnectionError, InvalidURL, Timeout and HTTPError
                _log_failure(document_name, project_name, mod_name,
                             folder_name)
                continue

            # Decode any gzip/deflate transfer encoding before writing the
            # raw stream, otherwise compressed bytes end up on disk.
            r.raw.decode_content = True
            with open(document_path, 'wb') as f:
                shutil.copyfileobj(r.raw, f)


def mod_ids_from_search(search_results_url):
    """Return the list of modification job IDs on a search results page."""
    # Get HTML of search results page and parse it
    page = requests.get(search_results_url, timeout=REQUEST_TIMEOUT)
    tree = html.fromstring(page.content)

    # Find job ids of items in results list
    mod_ids = []
    for mod in tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a'):
        match = re.search(r'(?<=job_id=)\d+', mod.get('href') or '')
        if match:  # skip links without a job_id instead of crashing
            mod_ids.append(match.group())

    return mod_ids


def main():
    """Command line entry point: parse arguments and download documents."""
    example_text = """examples:

    # Grab a single project modification using its job id, and save in 'files'
    python major_projects_grabber.py -o files -i 1019

    # Grab all modifications in a search results page, and save in current folder
    python major_projects_grabber.py -o . -u SEARCH_RESULTS_URL
    """

    # Set up command line arguments
    parser = argparse.ArgumentParser(
        epilog=example_text,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '-o', metavar='OUTPUT_DIR', default='.', help='root output directory')
    parser.add_argument(
        '-i', metavar='ID', default=[], help='modification job id(s)',
        nargs='*')
    parser.add_argument('-u', metavar='URL', help='url of search results page')

    # Print usage if no arguments are provided
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    # Parse arguments
    args = parser.parse_args()
    search_results_url = args.u
    output_dir = args.o
    # Copy so we never mutate argparse's shared default list
    mod_ids = list(args.i)

    # Set up log file inside the output directory
    os.makedirs(output_dir, exist_ok=True)
    log_name = os.path.join(output_dir, 'errors.log')
    logging.basicConfig(filename=log_name, level=logging.ERROR)

    # Get mod IDs from search results
    if search_results_url:
        mod_ids.extend(mod_ids_from_search(search_results_url))

    # Download documents from given modification ids
    for mod_id in mod_ids:
        get_documents(mod_id, output_dir)


if __name__ == '__main__':
    main()