diff --git a/major_projects_grabber/major_projects_grabber.py b/major_projects_grabber/major_projects_grabber.py index 0bf4489..f6c12e0 100644 --- a/major_projects_grabber/major_projects_grabber.py +++ b/major_projects_grabber/major_projects_grabber.py @@ -22,6 +22,18 @@ from tqdm import tqdm from requests.exceptions import ConnectionError, InvalidURL +def make_safe(s): + """Remove characters that would be invalid in a filepath""" + # Remove '\', '*', '"', '<', '>' '|' + s_safe = re.sub('\\\|\*|"|<|>\|', '', s) + + # Replace '/' and ':' with '-' + s_safe = re.sub(':', ' -', s_safe) + s_safe = re.sub('/', '-', s_safe) + + return s_safe + + def mod_ids_from_search(search_results_url): """Get modification job IDs from search results URL""" @@ -58,6 +70,10 @@ def get_document_list(mod_id, output_dir): # Get list of document folders folders = mod_tree.xpath('//div[@class="folder_row"]') + # Remove invalid characters before creating folders + project_name = make_safe(project_name) + mod_name = make_safe(mod_name) + # Create modification folder mod_dir = os.path.join(output_dir, project_name, mod_name) os.makedirs(mod_dir, exist_ok=True) @@ -94,15 +110,14 @@ def get_document_list(mod_id, output_dir): def download_document(url, document_path): """Download document from given url""" - # Create output directories as required + # Check if destination path is too long (Windows filename limitation) try: - os.makedirs(os.path.dirname(document_path), exist_ok=True) - except OSError: - logging.error(('Failed to download {4}\n' - ' Project: {1}\n' - ' Modification: {2}\n' - ' Folder: {3}\n').format(*document_path.split(os.sep))) - return + open(document_path, 'a').close() + except FileNotFoundError: + document_path = '\\\\?\\' + os.path.abspath(document_path) + + # Create output directories as required + os.makedirs(os.path.dirname(document_path), exist_ok=True) # Check if file exists if os.path.isfile(document_path): @@ -119,12 +134,6 @@ def download_document(url, document_path): ' Folder: {3}\n').format(*document_path.split(os.sep))) return - # Check if destination path is too long (Windows filename limitation) - try: - open(document_path, 'a').close() - except FileNotFoundError: - document_path = '\\\\?\\' + os.path.abspath(document_path) - # Write file to disk with open(document_path, 'wb') as f: shutil.copyfileobj(r.raw, f) @@ -191,6 +200,7 @@ def main(): doc_pbar.set_description(doc_name) # Download document + print(doc) download_document(doc['url'], doc['document_path']) # Tidy up console after tqdm