"""major_projects_grabber.py
Download documents from the NSW DPE Major Projects website.
Example usage:
# Grab a single project modification using its job id, and save in 'files'
major_projects_grabber -o files -i 1746
# Grab all modifications in search results page, and save in current folder
major_projects_grabber "http://majorprojects.planning.nsw.gov.au/index.pl?action=search&authority_id=547"
"""
import os
import re
import sys
import shutil
import logging
import requests
import argparse
import pandas as pd
from lxml import html
from tqdm import tqdm
from requests.exceptions import ConnectionError, InvalidURL
def _log_failure(document_name, project_name, mod_name, folder_name):
    """Log a single document-download failure to the error log."""
    logging.error(('Failed to download {}\n'
                   '    Project: {}\n'
                   '    Modification: {}\n'
                   '    Folder: {}\n').format(
                       document_name, project_name, mod_name, folder_name))


def get_documents(mod_id, output_dir):
    """Download all documents for one project modification.

    Scrapes the modification's portal page, then saves every listed
    document under ``output_dir/<project>/<modification>/<folder>/``.
    Also writes a ``DPE-portal-page.html`` redirect back to the portal
    page, and a marker ``.txt`` file when the portal lists no documents.
    Documents already present on disk are skipped; individual failures
    are logged and do not abort the remaining downloads.

    Args:
        mod_id: Modification job id string, as used by the DPE portal.
        output_dir: Root directory to save files under.
    """
    # Get html from mod page
    mod_url = ('http://majorprojects.planning.nsw.gov.au/'
               'index.pl?action=view_job&job_id=' + mod_id)
    mod_page = requests.get(mod_url)
    mod_tree = html.fromstring(mod_page.content)
    # Project and modification names come from the page header block.
    project_name = mod_tree.xpath(
        '//*[@class="vpa_header vpa_header_contrast"]/h2')[0].text
    mod_name = mod_tree.xpath(
        '//*[@class="vpa_header vpa_header_contrast"]/h1')[0].text
    # Get list of document folders
    folders = mod_tree.xpath('//div[@class="folder_row"]')
    # Create modification folder
    mod_dir = os.path.join(output_dir, project_name, mod_name)
    os.makedirs(mod_dir, exist_ok=True)
    # Add note if no documents are found on portal
    if not folders:
        txt_name = 'No documents on DPE portal for this modification.txt'
        open(os.path.join(mod_dir, txt_name), 'a').close()
    # Create link to DPE Major Projects page for current modification
    text = """<html>
<meta http-equiv="refresh" content="0; url={}">
</html>""".format(mod_url)
    with open(os.path.join(mod_dir, 'DPE-portal-page.html'), 'w') as f:
        f.write(text)
    for folder in folders:
        folder_name = folder.xpath('a[2]')[0].text.strip()
        # Get documents in current folder
        documents = folder.xpath('ul/li/a')
        for document in documents:
            document_url = document.get('href')
            document_name = document.text
            document_path = os.path.join(mod_dir, folder_name, document_name)
            # Create output directories as required; a bad name (e.g.
            # illegal path characters) is logged and skipped.
            try:
                os.makedirs(os.path.dirname(document_path), exist_ok=True)
            except OSError:
                _log_failure(document_name, project_name, mod_name,
                             folder_name)
                continue
            # Skip documents that were already downloaded.
            if os.path.isfile(document_path):
                continue
            try:
                # stream=True so large files are copied chunk-wise below.
                r = requests.get(document_url, stream=True)
            except (ConnectionError, InvalidURL):
                _log_failure(document_name, project_name, mod_name,
                             folder_name)
                continue
            with open(document_path, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
def mod_ids_from_search(search_results_url):
    """Return modification job IDs scraped from a search results page.

    Args:
        search_results_url: URL of a DPE Major Projects search results
            page (``index.pl?action=search&...``).

    Returns:
        List of job-id strings, one per result row whose link carries a
        ``job_id=`` query parameter; rows without one are skipped.
    """
    # Get HTML of search results page and build the element tree.
    page = requests.get(search_results_url)
    tree = html.fromstring(page.content)
    # Find job ids of items in the results list. The hrefs look like
    # 'index.pl?action=view_job&job_id=1234'.
    mod_ids = []
    for mod in tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a'):
        # Raw string for the regex; skip anchors without a job_id
        # instead of raising AttributeError on a None match.
        match = re.search(r'(?<=job_id=)\d+', mod.get('href'))
        if match:
            mod_ids.append(match.group())
    return mod_ids
def main():
    """Command-line entry point.

    Parses ``-o`` (output dir), ``-i`` (explicit job ids) and ``-u``
    (search results URL), configures an error log inside the output
    directory, then downloads documents for every collected job id.
    """
    example_text = """examples:
# Grab a single project modification using its job id, and save in 'files'
major_projects_grabber -i 1746 -o files
# Grab all modifications in search results page, and save in current folder
major_projects_grabber http://majorprojects.planning.nsw.gov.au/index.pl?action=search&authority_id=547
"""
    # Set up command line arguments
    parser = argparse.ArgumentParser(
        epilog=example_text,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '-o', metavar='OUTPUT_DIR', default='.', help='root output directory')
    parser.add_argument(
        '-i',
        metavar='ID',
        default=[],
        help='modification job id(s)',
        nargs='*')
    parser.add_argument('-u', metavar='URL', help='url of search results page')
    # Print usage if no arguments are provided
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    # Parse arguments
    args = parser.parse_args()
    search_results_url = args.u
    output_dir = args.o
    # Copy args.i: argparse returns the default list object itself when
    # '-i' is absent, so extending it in place would mutate the shared
    # default across parses.
    mod_ids = list(args.i)
    # Set up log file inside the output directory.
    os.makedirs(output_dir, exist_ok=True)
    log_name = os.path.join(output_dir, 'errors.log')
    logging.basicConfig(filename=log_name, level=logging.ERROR)
    # Get mod IDs from search results
    if search_results_url:
        mod_ids.extend(mod_ids_from_search(search_results_url))
    # Download documents from given modification ids, with a progress bar.
    for mod_id in tqdm(mod_ids):
        get_documents(mod_id, output_dir)


if __name__ == '__main__':
    main()