Initial commit
commit
ee718fd7c3
@ -0,0 +1 @@
|
||||
from .major_projects_grabber import main
|
@ -0,0 +1,155 @@
|
||||
"""major_projects_grabber.py
|
||||
Download documents from the NSW DPE Major Projects website.
|
||||
|
||||
Example usage:
|
||||
# Grab a single project modification using its job id, and save in 'files'
|
||||
python major_projects_grabber.py -o files -i 1019
|
||||
|
||||
# Grab all modifications in search results page, and save in current folder
|
||||
python major_projects_grabber.py -o . -u http://<search-results-url>
|
||||
|
||||
"""
|
||||
|
||||
# search url
|
||||
# http://majorprojects.planning.nsw.gov.au/index.pl?action=search&page_id=&search=&authority_id=&search_site_type_id=10&reference_table=&status_id=&decider=&from_date=&to_date=&x=31&y=16
|
||||
|
||||
# mod url
|
||||
# http://majorprojects.planning.nsw.gov.au/index.pl?action=view_job&job_id=9503
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import logging
|
||||
import requests
|
||||
import argparse
|
||||
import pandas as pd
|
||||
from lxml import html
|
||||
from requests.exceptions import ConnectionError, InvalidURL
|
||||
|
||||
|
||||
def get_documents(mod_id, output_dir):
    """Download all documents attached to a project modification.

    Args:
        mod_id: job id of the modification (str or int).
        output_dir: root directory to save documents under.

    Documents are written to
    ``output_dir/<project>/<modification>/<folder>/<document>``.
    Existing files are not re-downloaded; failures are logged to the
    module logger and skipped.
    """

    def log_failure(document_name, project_name, mod_name, folder_name):
        # Single error format shared by the mkdir and download failure paths
        # (the original duplicated this message verbatim in both handlers).
        logging.error(('Failed to download {}\n'
                       ' Project: {}\n'
                       ' Modification: {}\n'
                       ' Folder: {}\n').format(
                           document_name, project_name, mod_name,
                           folder_name))

    # Get html from mod page; str() lets callers pass an int job id
    mod_url = ('http://majorprojects.planning.nsw.gov.au/'
               'index.pl?action=view_job&job_id=' + str(mod_id))
    # NOTE(review): no timeout is set on these requests — a hung server
    # stalls the whole run; consider adding one.
    mod_page = requests.get(mod_url)
    mod_tree = html.fromstring(mod_page.content)

    # Get mod details (project and modification titles from the page header)
    project_name = mod_tree.xpath(
        '//*[@class="vpa_header vpa_header_contrast"]/h2')[0].text
    mod_name = mod_tree.xpath(
        '//*[@class="vpa_header vpa_header_contrast"]/h1')[0].text

    # Get list of document folders
    folders = mod_tree.xpath('//div[@class="folder_row"]')
    for folder in folders:
        folder_name = folder.xpath('a[2]')[0].text.strip()

        # Get documents in current folder
        documents = folder.xpath('ul/li/a')
        for document in documents:
            document_url = document.get('href')
            document_name = document.text
            document_path = os.path.join(output_dir, project_name, mod_name,
                                         folder_name, document_name)

            # Create output directories as required
            try:
                os.makedirs(os.path.dirname(document_path), exist_ok=True)
            except OSError:
                log_failure(document_name, project_name, mod_name,
                            folder_name)
                continue

            # Skip documents that already exist locally
            if os.path.isfile(document_path):
                continue

            try:
                r = requests.get(document_url, stream=True)
            except (ConnectionError, InvalidURL):
                log_failure(document_name, project_name, mod_name,
                            folder_name)
                continue

            # Stream the response body to disk, then release the
            # connection — streamed responses must be closed explicitly
            # or the underlying socket is leaked.
            try:
                with open(document_path, 'wb') as f:
                    shutil.copyfileobj(r.raw, f)
            finally:
                r.close()
|
||||
|
||||
|
||||
def mod_ids_from_search(search_results_url):
    """Get modification job IDs from a search results URL.

    Args:
        search_results_url: URL of a Major Projects search results page.

    Returns:
        List of job id strings extracted from the result links; links
        without a ``job_id`` parameter are skipped.
    """

    # Get HTML of search results page
    page = requests.get(search_results_url)

    # Create HTML tree
    tree = html.fromstring(page.content)

    # Find job ids of items in results list.  The raw string fixes the
    # invalid ``\d`` escape warning; the guard avoids an AttributeError
    # when a result link carries no job_id parameter.
    mod_ids = []
    for mod in tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a'):
        match = re.search(r'(?<=job_id=)\d+', mod.get('href') or '')
        if match:
            mod_ids.append(match.group())

    return mod_ids
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse arguments, then download documents
    for each requested modification id (explicit ``-i`` ids plus any ids
    scraped from a ``-u`` search results page)."""
    example_text = """examples:

    # Grab a single project modification using its job id, and save in 'files'
    python major_projects_grabber.py -o files -i 1019

    # Grab all modifications in search results page, and save in current folder
    python major_projects_grabber.py -o . -u http://
    """

    # Set up command line arguments
    parser = argparse.ArgumentParser(
        epilog=example_text,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '-o', metavar='OUTPUT_DIR', default='.', help='root output directory')
    parser.add_argument(
        '-i', metavar='ID', default=[], help='modification job id(s)', nargs='*')
    parser.add_argument('-u', metavar='URL', help='url of search results page')

    # Print usage if no arguments are provided
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    # Parse arguments
    args = parser.parse_args()
    search_results_url = args.u
    output_dir = args.o
    # Copy: argparse hands back the *same* default list object on every
    # call, so extending args.i directly would accumulate ids across
    # repeated invocations of main().
    mod_ids = list(args.i)

    # Set up log file inside the output directory
    os.makedirs(output_dir, exist_ok=True)
    log_name = os.path.join(output_dir, 'errors.log')
    logging.basicConfig(filename=log_name, level=logging.ERROR)

    # Get mod IDs from search results
    if search_results_url:
        mod_ids.extend(mod_ids_from_search(search_results_url))

    # Download documents from given modification ids
    for mod_id in mod_ids:
        get_documents(mod_id, output_dir)
|
||||
|
||||
|
||||
# Run the command-line interface when executed as a script.
if __name__ == '__main__':
    main()
|
@ -0,0 +1,15 @@
|
||||
from setuptools import setup
|
||||
|
||||
# Package metadata: installs the module as `major_projects_grabber` and
# exposes a console command of the same name that calls main().
setup(
    name='major_projects_grabber',
    version='0.1.0',
    packages=['major_projects_grabber'],
    # Third-party runtime dependencies imported by the module.
    install_requires=['requests', 'pandas', 'lxml'],
    entry_points={
        'console_scripts': [
            'major_projects_grabber = major_projects_grabber:main'
        ]
    },
    author='Dan Howe',
    author_email='d.howe@wrl.unsw.edu.au',
    description='Download documents from the NSW DPE Major Projects website.')
|
Loading…
Reference in New Issue