@@ -17,15 +17,31 @@ import shutil
 import logging
 import requests
 import argparse
 import pandas as pd
 from lxml import html
 from tqdm import tqdm
 from requests.exceptions import ConnectionError, InvalidURL
 
 
-def get_documents(mod_id, output_dir):
-    """Download all documents from project modification ID
-    """
+def mod_ids_from_search(search_results_url):
+    """Get modification job IDs from search results URL"""
+
+    # Get HTML of search results page
+    page = requests.get(search_results_url)
+
+    # Create HTML tree
+    tree = html.fromstring(page.content)
+
+    # Find job ids of items in results list
+    mod_ids = []
+    mods = tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a')
+    for mod in mods:
+        mod_ids.append(re.search('(?<=job_id=)\d+', mod.get('href')).group())
+
+    return mod_ids
+
+def get_document_list(mod_id, output_dir):
+    """Get list of documents from project modification ID"""
 
     # Get html from mod page
     mod_url = ('http://majorprojects.planning.nsw.gov.au/'
@@ -58,65 +74,55 @@ def get_documents(mod_id, output_dir):
     with open(os.path.join(mod_dir, 'DPE-portal-page.html'), 'w') as f:
         f.write(text)
 
+    document_data = []
     for folder in folders:
         folder_name = folder.xpath('a[2]')[0].text.strip()
 
         # Get documents in current folder
        documents = folder.xpath('ul/li/a')
         for document in documents:
-            document_url = document.get('href')
-            document_name = document.text
-            document_path = os.path.join(output_dir, project_name, mod_name,
-                                         folder_name, document_name)
+            doc = {}
+            doc['url'] = document.get('href')
+            doc['name'] = document.text
+            doc['document_path'] = os.path.join(
+                output_dir, project_name, mod_name, folder_name, doc['name'])
+            document_data.append(doc)
+
+    return document_data
+
+
+def download_document(url, document_path):
+    """Download document from given url"""
 
     # Create output directories as required
     try:
         os.makedirs(os.path.dirname(document_path), exist_ok=True)
     except OSError:
-        logging.error(('Failed to download {}\n'
-                       ' Project: {}\n'
-                       ' Modification: {}\n'
-                       ' Folder: {}\n').format(
-                           document_name, project_name, mod_name,
-                           folder_name))
-        continue
-
-    # Download document, if it does not already exist
+        logging.error(('Failed to download {4}\n'
+                       ' Project: {1}\n'
+                       ' Modification: {2}\n'
+                       ' Folder: {3}\n').format(*document_path.split(os.sep)))
+        return
+
+    # Check if file exists
     if os.path.isfile(document_path):
         pass
     else:
         try:
-            r = requests.get(document_url, stream=True)
+            # Attempt to download file
+            r = requests.get(url, stream=True)
         except (ConnectionError, InvalidURL):
-            logging.error(('Failed to download {}\n'
-                           ' Project: {}\n'
-                           ' Modification: {}\n'
-                           ' Folder: {}\n').format(
-                               document_name, project_name, mod_name,
-                               folder_name))
-            continue
+            logging.error(
+                ('Failed to download {4}\n'
+                 ' Project: {1}\n'
+                 ' Modification: {2}\n'
+                 ' Folder: {3}\n').format(*document_path.split(os.sep)))
+            return
 
         # Write file to disk
         with open(document_path, 'wb') as f:
             shutil.copyfileobj(r.raw, f)
 
 
-def mod_ids_from_search(search_results_url):
-    """Get modification job IDs from search results URL"""
-
-    # Get HTML of search results page
-    page = requests.get(search_results_url)
-
-    # Create HTML tree
-    tree = html.fromstring(page.content)
-
-    # Find job ids of items in results list
-    mod_ids = []
-    mods = tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a')
-    for mod in mods:
-        mod_ids.append(re.search('(?<=job_id=)\d+', mod.get('href')).group())
-
-    return mod_ids
-
 def main():
     example_text = """examples:
@@ -162,9 +168,26 @@ def main():
     search_mod_ids = mod_ids_from_search(search_results_url)
     mod_ids.extend(search_mod_ids)
 
-    # Download documents from given modification ids
-    for mod_id in tqdm(mod_ids):
-        get_documents(mod_id, output_dir)
+    # Get list of documents from given modification IDs
+    mod_pbar = tqdm(mod_ids)
+    for mod_id in mod_pbar:
+        document_data = get_document_list(mod_id, output_dir)
+
+        # Download documents for current modification
+        doc_pbar = tqdm(document_data)
+
+        for doc in doc_pbar:
+            # Update progress bars
+            mod_name = doc['document_path'].split(os.sep)[-3]
+            doc_name = doc['document_path'].split(os.sep)[-1]
+            mod_pbar.set_description(mod_name)
+            doc_pbar.set_description(doc_name)
+
+            # Download document
+            download_document(doc['url'], doc['document_path'])
+
+    # Tidy up console after tqdm
+    print('\n')
 
 
 if __name__ == '__main__':