"""major_projects_grabber.py
Download documents from the NSW DPE Major Projects website.

Example usage:

# Grab a single project modification using its job id, and save in 'files'
python major_projects_grabber.py -o files -i 1019

# Grab all modifications in search results page, and save in current folder
python major_projects_grabber.py -o . -u http://

"""

# search url
# http://majorprojects.planning.nsw.gov.au/index.pl?action=search&page_id=&search=&authority_id=&search_site_type_id=10&reference_table=&status_id=&decider=&from_date=&to_date=&x=31&y=16
# mod url
# http://majorprojects.planning.nsw.gov.au/index.pl?action=view_job&job_id=9503

import os
import re
import sys
import shutil
import logging
import requests
import argparse
import pandas as pd
from lxml import html
from requests.exceptions import ConnectionError, InvalidURL, RequestException

# Base URL of a modification page; the job id is appended to it.
MOD_URL = ('http://majorprojects.planning.nsw.gov.au/'
           'index.pl?action=view_job&job_id=')

# Extracts the digits that follow 'job_id=' in a link target.
JOB_ID_PATTERN = re.compile(r'(?<=job_id=)\d+')

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 60

# Number of bytes written to disk per chunk while streaming a document.
CHUNK_SIZE = 64 * 1024


def _log_download_failure(document_name, project_name, mod_name, folder_name):
    """Record a document that could not be saved in the error log."""
    logging.error(('Failed to download {}\n'
                   ' Project: {}\n'
                   ' Modification: {}\n'
                   ' Folder: {}\n').format(
                       document_name, project_name, mod_name, folder_name))


def _first_text(node, xpath):
    """Return the text of the first element matching xpath, or None."""
    matches = node.xpath(xpath)
    if not matches:
        return None
    return matches[0].text


def _download(url, path):
    """Stream url into path, returning True on success and False on failure.

    The body is written to a temporary '.part' file that is renamed only
    once the transfer has finished, so an interrupted download never leaves
    a truncated file under the final name (which a later run would skip).
    """
    partial_path = path + '.part'
    try:
        with requests.get(url, stream=True,
                          timeout=REQUEST_TIMEOUT) as response:
            # Do not save an HTTP error page as if it were the document
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                # iter_content decodes gzip/deflate transfer encodings and
                # reports network failures as requests exceptions
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    f.write(chunk)
        os.replace(partial_path, path)
    except (RequestException, OSError):
        try:
            os.remove(partial_path)
        except OSError:
            # Nothing was written before the failure
            pass
        return False
    return True


def get_documents(mod_id, output_dir):
    """Download all documents from project modification ID

    Documents are saved as
    output_dir/<project name>/<modification name>/<folder name>/<document>.
    Documents that already exist on disk are not downloaded again. Failures
    are written to the error log and the remaining documents are still
    processed.

    Args:
        mod_id: job id of the modification (string or integer).
        output_dir: root directory for the downloaded files.
    """
    # Get html from mod page
    mod_url = MOD_URL + str(mod_id)
    try:
        mod_page = requests.get(mod_url, timeout=REQUEST_TIMEOUT)
        mod_page.raise_for_status()
    except RequestException:
        logging.error('Failed to load modification page {}'.format(mod_url))
        return
    mod_tree = html.fromstring(mod_page.content)

    # Get mod details
    header = '//*[@class="vpa_header vpa_header_contrast"]'
    project_name = _first_text(mod_tree, header + '/h2')
    mod_name = _first_text(mod_tree, header + '/h1')
    if project_name is None or mod_name is None:
        logging.error('No project details found on {}'.format(mod_url))
        return

    # NOTE(review): names taken from the page are used as path components
    # without sanitising; a name containing a path separator changes where
    # the file is written.

    # Get list of document folders
    for folder in mod_tree.xpath('//div[@class="folder_row"]'):
        folder_name = _first_text(folder, 'a[2]')
        if folder_name is None:
            logging.error('Skipped unnamed folder on {}'.format(mod_url))
            continue
        folder_name = folder_name.strip()

        # Get documents in current folder
        for document in folder.xpath('ul/li/a'):
            document_url = document.get('href')
            document_name = document.text

            # A link without a target or a label cannot be saved
            if not document_url or not document_name:
                _log_download_failure(document_name, project_name, mod_name,
                                      folder_name)
                continue

            document_path = os.path.join(output_dir, project_name, mod_name,
                                         folder_name, document_name)

            # Download document, if it does not already exist
            if os.path.isfile(document_path):
                continue

            # Create output directories as required
            try:
                os.makedirs(os.path.dirname(document_path), exist_ok=True)
            except OSError:
                _log_download_failure(document_name, project_name, mod_name,
                                      folder_name)
                continue

            if not _download(document_url, document_path):
                _log_download_failure(document_name, project_name, mod_name,
                                      folder_name)


def mod_ids_from_search(search_results_url):
    """Get modification job IDs from search results URL

    Args:
        search_results_url: url of a search results page.

    Returns:
        List of job id strings, in page order. Links that do not contain a
        job id are ignored.
    """
    # Get HTML of search results page
    page = requests.get(search_results_url, timeout=REQUEST_TIMEOUT)

    # Create HTML tree
    tree = html.fromstring(page.content)

    # Find job ids of items in results list
    mod_ids = []
    for mod in tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a'):
        match = JOB_ID_PATTERN.search(mod.get('href') or '')
        if match:
            mod_ids.append(match.group())

    return mod_ids


def main():
    """Parse the command line and download the requested modifications."""
    example_text = """examples:

    # Grab a single project modification using its job id, and save in 'files'
    python major_projects_grabber.py -o files -i 1019

    # Grab all modifications in search results page, and save in current folder
    python major_projects_grabber.py -o . -u http://

    """

    # Set up command line arguments
    parser = argparse.ArgumentParser(
        epilog=example_text,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '-o', metavar='OUTPUT_DIR', default='.', help='root output directory')
    parser.add_argument(
        '-i',
        metavar='ID',
        default=[],
        help='modification job id(s)',
        nargs='*')
    parser.add_argument('-u', metavar='URL', help='url of search results page')

    # Print usage if no arguments are provided
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    # Parse arguments
    args = parser.parse_args()
    search_results_url = args.u
    output_dir = args.o
    # Copy so the parser's default list is never modified
    mod_ids = list(args.i)

    # Set up log File
    os.makedirs(output_dir, exist_ok=True)
    log_name = os.path.join(output_dir, 'errors.log')
    logging.basicConfig(filename=log_name, level=logging.ERROR)

    # Get mod IDs from search results
    if search_results_url:
        mod_ids.extend(mod_ids_from_search(search_results_url))

    # Download documents from given modification ids, visiting each id once
    for mod_id in dict.fromkeys(mod_ids):
        get_documents(mod_id, output_dir)


if __name__ == '__main__':
    main()