Add progress bars

master
Dan Howe 6 years ago
parent f701e4212b
commit e9f99f54de

@ -17,15 +17,31 @@ import shutil
import logging import logging
import requests import requests
import argparse import argparse
import pandas as pd
from lxml import html from lxml import html
from tqdm import tqdm from tqdm import tqdm
from requests.exceptions import ConnectionError, InvalidURL from requests.exceptions import ConnectionError, InvalidURL
def mod_ids_from_search(search_results_url):
    """Get modification job IDs from search results URL.

    Fetches the search results page and scrapes the ``job_id`` query
    parameter out of each result link in the ``vpa_list`` table.

    Returns a list of job-id strings (possibly empty).
    """
    # Get HTML of search results page
    page = requests.get(search_results_url)
    # Create HTML tree
    tree = html.fromstring(page.content)
    # Find job ids of items in results list
    mod_ids = []
    mods = tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a')
    for mod in mods:
        # Raw string avoids the invalid "\d" escape warning.  Skip any
        # anchor whose href is missing or lacks a job_id instead of
        # crashing with AttributeError on re.search(...) returning None.
        match = re.search(r'(?<=job_id=)\d+', mod.get('href') or '')
        if match:
            mod_ids.append(match.group())
    return mod_ids
def get_document_list(mod_id, output_dir):
"""Get list of documents from project modification ID"""
# Get html from mod page # Get html from mod page
mod_url = ('http://majorprojects.planning.nsw.gov.au/' mod_url = ('http://majorprojects.planning.nsw.gov.au/'
@ -58,63 +74,53 @@ def get_documents(mod_id, output_dir):
with open(os.path.join(mod_dir, 'DPE-portal-page.html'), 'w') as f: with open(os.path.join(mod_dir, 'DPE-portal-page.html'), 'w') as f:
f.write(text) f.write(text)
document_data = []
for folder in folders: for folder in folders:
folder_name = folder.xpath('a[2]')[0].text.strip() folder_name = folder.xpath('a[2]')[0].text.strip()
# Get documents in current folder # Get documents in current folder
documents = folder.xpath('ul/li/a') documents = folder.xpath('ul/li/a')
for document in documents: for document in documents:
document_url = document.get('href') doc = {}
document_name = document.text doc['url'] = document.get('href')
document_path = os.path.join(output_dir, project_name, mod_name, doc['name'] = document.text
folder_name, document_name) doc['document_path'] = os.path.join(
output_dir, project_name, mod_name, folder_name, doc['name'])
# Create output directories as required document_data.append(doc)
try:
os.makedirs(os.path.dirname(document_path), exist_ok=True) return document_data
except OSError:
logging.error(('Failed to download {}\n'
' Project: {}\n' def download_document(url, document_path):
' Modification: {}\n' """Download document from given url"""
' Folder: {}\n').format(
document_name, project_name, mod_name, # Create output directories as required
folder_name)) try:
continue os.makedirs(os.path.dirname(document_path), exist_ok=True)
except OSError:
# Download document, if it does not already exist logging.error(('Failed to download {4}\n'
if os.path.isfile(document_path): ' Project: {1}\n'
pass ' Modification: {2}\n'
else: ' Folder: {3}\n').format(*document_path.split(os.sep)))
try: return
r = requests.get(document_url, stream=True)
except (ConnectionError, InvalidURL): # Check if file exists
logging.error(('Failed to download {}\n' if os.path.isfile(document_path):
' Project: {}\n' pass
' Modification: {}\n' else:
' Folder: {}\n').format( try:
document_name, project_name, mod_name, # Attempt to download file
folder_name)) r = requests.get(url, stream=True)
continue except (ConnectionError, InvalidURL):
with open(document_path, 'wb') as f: logging.error(
shutil.copyfileobj(r.raw, f) ('Failed to download {4}\n'
' Project: {1}\n'
' Modification: {2}\n'
def mod_ids_from_search(search_results_url): ' Folder: {3}\n').format(*document_path.split(os.sep)))
"""Get modification job IDs from search results URL""" return
# Write file to disk
# Get HTML of search results page with open(document_path, 'wb') as f:
page = requests.get(search_results_url) shutil.copyfileobj(r.raw, f)
# Create HTML tree
tree = html.fromstring(page.content)
# Find job ids of items in results list
mod_ids = []
mods = tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a')
for mod in mods:
mod_ids.append(re.search('(?<=job_id=)\d+', mod.get('href')).group())
return mod_ids
def main(): def main():
@ -162,9 +168,26 @@ def main():
search_mod_ids = mod_ids_from_search(search_results_url) search_mod_ids = mod_ids_from_search(search_results_url)
mod_ids.extend(search_mod_ids) mod_ids.extend(search_mod_ids)
# Download documents from given modification ids # Get list of documents from given modification IDs
for mod_id in tqdm(mod_ids): mod_pbar = tqdm(mod_ids)
get_documents(mod_id, output_dir) for mod_id in mod_pbar:
document_data = get_document_list(mod_id, output_dir)
# Download documents for current modification
doc_pbar = tqdm(document_data)
for doc in doc_pbar:
# Update progress bars
mod_name = doc['document_path'].split(os.sep)[-3]
doc_name = doc['document_path'].split(os.sep)[-1]
mod_pbar.set_description(mod_name)
doc_pbar.set_description(doc_name)
# Download document
download_document(doc['url'], doc['document_path'])
# Tidy up console after tqdm
print('\n')
if __name__ == '__main__': if __name__ == '__main__':

Loading…
Cancel
Save