diff --git a/major_projects_grabber/major_projects_grabber.py b/major_projects_grabber/major_projects_grabber.py
index d449d64..3d11040 100644
--- a/major_projects_grabber/major_projects_grabber.py
+++ b/major_projects_grabber/major_projects_grabber.py
@@ -17,15 +17,31 @@
 import shutil
 import logging
 import requests
 import argparse
-import pandas as pd
 from lxml import html
 from tqdm import tqdm
 from requests.exceptions import ConnectionError, InvalidURL
 
-def get_documents(mod_id, output_dir):
-    """Download all documents from project modification ID
-    """
+def mod_ids_from_search(search_results_url):
+    """Get modification job IDs from search results URL"""
+
+    # Get HTML of search results page
+    page = requests.get(search_results_url)
+
+    # Create HTML tree
+    tree = html.fromstring(page.content)
+
+    # Find job ids of items in results list
+    mod_ids = []
+    mods = tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a')
+    for mod in mods:
+        mod_ids.append(re.search(r'(?<=job_id=)\d+', mod.get('href')).group())
+
+    return mod_ids
+
+
+def get_document_list(mod_id, output_dir):
+    """Get list of documents from project modification ID"""
 
     # Get html from mod page
     mod_url = ('http://majorprojects.planning.nsw.gov.au/'
@@ -58,63 +74,53 @@ def get_documents(mod_id, output_dir):
         with open(os.path.join(mod_dir, 'DPE-portal-page.html'), 'w') as f:
             f.write(text)
 
+    document_data = []
     for folder in folders:
         folder_name = folder.xpath('a[2]')[0].text.strip()
 
         # Get documents in current folder
         documents = folder.xpath('ul/li/a')
         for document in documents:
-            document_url = document.get('href')
-            document_name = document.text
-            document_path = os.path.join(output_dir, project_name, mod_name,
-                                         folder_name, document_name)
-
-            # Create output directories as required
-            try:
-                os.makedirs(os.path.dirname(document_path), exist_ok=True)
-            except OSError:
-                logging.error(('Failed to download {}\n'
-                               '  Project: {}\n'
-                               '  Modification: {}\n'
-                               '  Folder: {}\n').format(
-                                   document_name, project_name, mod_name,
-                                   folder_name))
-                continue
-
-            # Download document, if it does not already exist
-            if os.path.isfile(document_path):
-                pass
-            else:
-                try:
-                    r = requests.get(document_url, stream=True)
-                except (ConnectionError, InvalidURL):
-                    logging.error(('Failed to download {}\n'
-                                   '  Project: {}\n'
-                                   '  Modification: {}\n'
-                                   '  Folder: {}\n').format(
-                                       document_name, project_name, mod_name,
-                                       folder_name))
-                    continue
-                with open(document_path, 'wb') as f:
-                    shutil.copyfileobj(r.raw, f)
-
-
-def mod_ids_from_search(search_results_url):
-    """Get modification job IDs from search results URL"""
-
-    # Get HTML of search results page
-    page = requests.get(search_results_url)
-
-    # Create HTML tree
-    tree = html.fromstring(page.content)
-
-    # Find job ids of items in results list
-    mod_ids = []
-    mods = tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a')
-    for mod in mods:
-        mod_ids.append(re.search('(?<=job_id=)\d+', mod.get('href')).group())
-
-    return mod_ids
+            doc = {}
+            doc['url'] = document.get('href')
+            doc['name'] = document.text
+            doc['document_path'] = os.path.join(
+                output_dir, project_name, mod_name, folder_name, doc['name'])
+            document_data.append(doc)
+
+    return document_data
+
+
+def download_document(url, document_path):
+    """Download document from given URL"""
+
+    # Create output directories as required
+    try:
+        os.makedirs(os.path.dirname(document_path), exist_ok=True)
+    except OSError:
+        logging.error(('Failed to download {4}\n'
+                       '  Project: {1}\n'
+                       '  Modification: {2}\n'
+                       '  Folder: {3}\n').format(*document_path.split(os.sep)[-5:]))
+        return
+
+    # Skip download if the file already exists
+    if os.path.isfile(document_path):
+        return
+
+    try:
+        # Attempt to download file
+        r = requests.get(url, stream=True)
+    except (ConnectionError, InvalidURL):
+        logging.error(
+            ('Failed to download {4}\n'
+             '  Project: {1}\n'
+             '  Modification: {2}\n'
+             '  Folder: {3}\n').format(*document_path.split(os.sep)[-5:]))
+        return
+    # Write file to disk
+    with open(document_path, 'wb') as f:
+        shutil.copyfileobj(r.raw, f)
 
 
 def main():
@@ -162,9 +168,26 @@ def main():
         search_mod_ids = mod_ids_from_search(search_results_url)
         mod_ids.extend(search_mod_ids)
 
-    # Download documents from given modification ids
-    for mod_id in tqdm(mod_ids):
-        get_documents(mod_id, output_dir)
+    # Get list of documents from given modification IDs
+    mod_pbar = tqdm(mod_ids)
+    for mod_id in mod_pbar:
+        document_data = get_document_list(mod_id, output_dir)
+
+        # Download documents for current modification
+        doc_pbar = tqdm(document_data)
+
+        for doc in doc_pbar:
+            # Update progress bars
+            mod_name = doc['document_path'].split(os.sep)[-3]
+            doc_name = doc['document_path'].split(os.sep)[-1]
+            mod_pbar.set_description(mod_name)
+            doc_pbar.set_description(doc_name)
+
+            # Download document
+            download_document(doc['url'], doc['document_path'])
+
+    # Tidy up console after tqdm
+    print('\n')
 
 
 if __name__ == '__main__':
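
Usage note: with get_documents split into mod_ids_from_search, get_document_list
and download_document, the helpers compose as sketched below. This is a minimal
sketch, not part of the patch; it assumes the module is importable under the repo
layout above, and the search URL and output directory are placeholder values.

    from major_projects_grabber.major_projects_grabber import (
        mod_ids_from_search, get_document_list, download_document)

    # Placeholder inputs; substitute a real DPE search results URL.
    search_results_url = 'http://majorprojects.planning.nsw.gov.au/'
    output_dir = 'documents'

    for mod_id in mod_ids_from_search(search_results_url):
        for doc in get_document_list(mod_id, output_dir):
            # Each doc dict carries 'url', 'name' and 'document_path', where
            # document_path is output_dir/project/modification/folder/name,
            # so download_document can derive its log context from the path.
            download_document(doc['url'], doc['document_path'])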