Add progress bars

master
Dan Howe 6 years ago
parent f701e4212b
commit e9f99f54de

@ -17,15 +17,31 @@ import shutil
import logging import logging
import requests import requests
import argparse import argparse
import pandas as pd
from lxml import html from lxml import html
from tqdm import tqdm from tqdm import tqdm
from requests.exceptions import ConnectionError, InvalidURL from requests.exceptions import ConnectionError, InvalidURL
def mod_ids_from_search(search_results_url):
    """Get modification job IDs from search results URL.

    Args:
        search_results_url: URL of a search results page.

    Returns:
        List of job ID strings, one for every link in the first column of
        the ``vpa_list`` table whose ``href`` contains ``job_id=<digits>``.
        Links without a job ID are skipped.
    """
    # Get HTML of search results page
    page = requests.get(search_results_url)

    # Create HTML tree
    tree = html.fromstring(page.content)

    # Find job ids of items in results list
    mod_ids = []
    for mod in tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a'):
        # Raw string keeps '\d' a regex escape rather than an (invalid)
        # string escape; a missing href is treated as an empty string
        match = re.search(r'(?<=job_id=)\d+', mod.get('href') or '')
        if match:
            mod_ids.append(match.group())

    return mod_ids
def get_document_list(mod_id, output_dir):
"""Get list of documents from project modification ID"""
# Get html from mod page # Get html from mod page
mod_url = ('http://majorprojects.planning.nsw.gov.au/' mod_url = ('http://majorprojects.planning.nsw.gov.au/'
@ -58,65 +74,55 @@ def get_documents(mod_id, output_dir):
with open(os.path.join(mod_dir, 'DPE-portal-page.html'), 'w') as f: with open(os.path.join(mod_dir, 'DPE-portal-page.html'), 'w') as f:
f.write(text) f.write(text)
document_data = []
for folder in folders: for folder in folders:
folder_name = folder.xpath('a[2]')[0].text.strip() folder_name = folder.xpath('a[2]')[0].text.strip()
# Get documents in current folder # Get documents in current folder
documents = folder.xpath('ul/li/a') documents = folder.xpath('ul/li/a')
for document in documents: for document in documents:
document_url = document.get('href') doc = {}
document_name = document.text doc['url'] = document.get('href')
document_path = os.path.join(output_dir, project_name, mod_name, doc['name'] = document.text
folder_name, document_name) doc['document_path'] = os.path.join(
output_dir, project_name, mod_name, folder_name, doc['name'])
document_data.append(doc)
return document_data
def download_document(url, document_path):
    """Download document from given url.

    Nothing is downloaded if a file already exists at ``document_path``.
    Failures to create the output directory or to connect to ``url`` are
    logged with ``logging.error`` and the function returns without raising.

    Args:
        url: Address of the document to download.
        document_path: Destination file path. Its last four components are
            expected to be ``<project>/<modification>/<folder>/<name>``.

    Returns:
        None.
    """

    def log_failure():
        # Index the path from the end so the message stays correct for any
        # output directory (absolute, nested, or a single relative folder)
        parts = document_path.split(os.sep)
        parts = [''] * (4 - len(parts)) + parts
        project, mod, folder, name = parts[-4:]
        logging.error(('Failed to download {}\n'
                       ' Project: {}\n'
                       ' Modification: {}\n'
                       ' Folder: {}\n').format(name, project, mod, folder))

    # Create output directories as required
    try:
        os.makedirs(os.path.dirname(document_path), exist_ok=True)
    except OSError:
        log_failure()
        return

    # Skip documents that have already been downloaded
    if os.path.isfile(document_path):
        return

    try:
        # Attempt to download file
        r = requests.get(url, stream=True)
    except (ConnectionError, InvalidURL):
        log_failure()
        return

    # Write file to disk, releasing the connection when finished
    try:
        with open(document_path, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    finally:
        r.close()
def mod_ids_from_search(search_results_url):
    """Get modification job IDs from search results URL.

    Args:
        search_results_url: URL of a search results page.

    Returns:
        List of job ID strings, one for every link in the first column of
        the ``vpa_list`` table whose ``href`` contains ``job_id=<digits>``.
        Links without a job ID are skipped.
    """
    # Get HTML of search results page
    page = requests.get(search_results_url)

    # Create HTML tree
    tree = html.fromstring(page.content)

    # Find job ids of items in results list
    mod_ids = []
    for mod in tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a'):
        # Raw string keeps '\d' a regex escape rather than an (invalid)
        # string escape; a missing href is treated as an empty string
        match = re.search(r'(?<=job_id=)\d+', mod.get('href') or '')
        if match:
            mod_ids.append(match.group())

    return mod_ids
def main(): def main():
example_text = """examples: example_text = """examples:
@ -162,9 +168,26 @@ def main():
search_mod_ids = mod_ids_from_search(search_results_url) search_mod_ids = mod_ids_from_search(search_results_url)
mod_ids.extend(search_mod_ids) mod_ids.extend(search_mod_ids)
# Download documents from given modification ids # Get list of documents from given modification IDs
for mod_id in tqdm(mod_ids): mod_pbar = tqdm(mod_ids)
get_documents(mod_id, output_dir) for mod_id in mod_pbar:
document_data = get_document_list(mod_id, output_dir)
# Download documents for current modification
doc_pbar = tqdm(document_data)
for doc in doc_pbar:
# Update progress bars
mod_name = doc['document_path'].split(os.sep)[-3]
doc_name = doc['document_path'].split(os.sep)[-1]
mod_pbar.set_description(mod_name)
doc_pbar.set_description(doc_name)
# Download document
download_document(doc['url'], doc['document_path'])
# Tidy up console after tqdm
print('\n')
if __name__ == '__main__': if __name__ == '__main__':

Loading…
Cancel
Save