"""major_projects_grabber.py Download documents from the NSW DPE Major Projects website. Example usage: # Grab a single project modification using its job id, and save in 'files' major_projects_grabber -o files -i 1746 # Grab all modifications in search results page, and save in current folder major_projects_grabber "http://majorprojects.planning.nsw.gov.au/index.pl?action=search&authority_id=547" """ import os import re import sys import shutil import logging import requests import argparse import pandas as pd from lxml import html from tqdm import tqdm from requests.exceptions import ConnectionError, InvalidURL def get_documents(mod_id, output_dir): """Download all documents from project modification ID """ # Get html from mod page mod_url = ('http://majorprojects.planning.nsw.gov.au/' 'index.pl?action=view_job&job_id=' + mod_id) mod_page = requests.get(mod_url) mod_tree = html.fromstring(mod_page.content) # Get mod details project_name = mod_tree.xpath( '//*[@class="vpa_header vpa_header_contrast"]/h2')[0].text mod_name = mod_tree.xpath( '//*[@class="vpa_header vpa_header_contrast"]/h1')[0].text # Get list of document folders folders = mod_tree.xpath('//div[@class="folder_row"]') # Create modification folder mod_dir = os.path.join(output_dir, project_name, mod_name) os.makedirs(mod_dir, exist_ok=True) # Add note if no documents are found on portal if not folders: txt_name = 'No documents on DPE portal for this modification.txt' open(os.path.join(mod_dir, txt_name), 'a').close() # Create link to DPE Major Projects page for current modification text = """ """.format(mod_url) with open(os.path.join(mod_dir, 'DPE-portal-page.html'), 'w') as f: f.write(text) for folder in folders: folder_name = folder.xpath('a[2]')[0].text.strip() # Get documents in current folder documents = folder.xpath('ul/li/a') for document in documents: document_url = document.get('href') document_name = document.text document_path = os.path.join(output_dir, project_name, mod_name, folder_name, document_name) # Create output directories as required try: os.makedirs(os.path.dirname(document_path), exist_ok=True) except OSError: logging.error(('Failed to download {}\n' ' Project: {}\n' ' Modification: {}\n' ' Folder: {}\n').format( document_name, project_name, mod_name, folder_name)) continue # Download document, if it does not already exist if os.path.isfile(document_path): pass else: try: r = requests.get(document_url, stream=True) except (ConnectionError, InvalidURL): logging.error(('Failed to download {}\n' ' Project: {}\n' ' Modification: {}\n' ' Folder: {}\n').format( document_name, project_name, mod_name, folder_name)) continue with open(document_path, 'wb') as f: shutil.copyfileobj(r.raw, f) def mod_ids_from_search(search_results_url): """Get modification job IDs from search results URL""" # Get HTML of search results page page = requests.get(search_results_url) # Create HTML tree tree = html.fromstring(page.content) # Find job ids of items in results list mod_ids = [] mods = tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a') for mod in mods: mod_ids.append(re.search('(?<=job_id=)\d+', mod.get('href')).group()) return mod_ids def main(): example_text = """examples: # Grab a single project modification using its job id, and save in 'files' major_projects_grabber -i 1746 -o files # Grab all modifications in search results page, and save in current folder major_projects_grabber http://majorprojects.planning.nsw.gov.au/index.pl?action=search&authority_id=547 """ # Set up command line arguments parser = argparse.ArgumentParser( epilog=example_text, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument( '-o', metavar='OUTPUT_DIR', default='.', help='root output directory') parser.add_argument( '-i', metavar='ID', default=[], help='modification job id(s)', nargs='*') parser.add_argument('-u', metavar='URL', help='url of search results page') # Print usage if no arguments are provided if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(1) # Parse arguments args = parser.parse_args() search_results_url = args.u output_dir = args.o mod_ids = args.i # Set up log File os.makedirs(output_dir, exist_ok=True) log_name = os.path.join(output_dir, 'errors.log') logging.basicConfig(filename=log_name, level=logging.ERROR) # Get mod IDs from search results if search_results_url: search_mod_ids = mod_ids_from_search(search_results_url) mod_ids.extend(search_mod_ids) # Download documents from given modification ids for mod_id in tqdm(mod_ids): get_documents(mod_id, output_dir) if __name__ == '__main__': main()