How to download files from URLs with Python?

Learn how to download files from URLs efficiently using Python. This short guide covers the essential libraries and techniques for retrieving files from online sources.

Best practices

  • Use the `requests` library for downloading files, as it gives you more control over requests and responses than the standard-library `urllib`, including error handling and session management (a session-based sketch follows the examples below).

  • Always check the `status_code` of the response object to ensure the HTTP request was successful before proceeding with file operations.

  • When downloading large files, pass `stream=True` to `requests.get()` so the content is downloaded in chunks rather than loaded into memory all at once.

  • Consider using the `tqdm` library to add a progress bar when downloading files, which improves the user experience by providing visual feedback on the download progress.

import requests # Import requests library

# Download file using requests
url = "https://sandbox.oxylabs.io/products/sample.pdf"
response = requests.get(url)
with open("sample.pdf", "wb") as file:
    file.write(response.content)

import urllib.request # Import urllib library for another method
# Download file using urllib
urllib.request.urlretrieve(url, "sample_urllib.pdf")

try:
    from tqdm import tqdm # Import tqdm for progress bar (optional)

    # Download with progress bar
    response = requests.get(url, stream=True)
    total_size_in_bytes = int(response.headers.get('content-length', 0))
    block_size = 1024 # 1 Kibibyte
    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
    with open("sample_with_progress.pdf", "wb") as file:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            file.write(data)
    progress_bar.close()
except ImportError:
    print("tqdm library is not installed. Install it to see the progress bar.")

Common issues

  • Handle exceptions such as `ConnectionError` or `Timeout` when using `requests.get()` so your code stays robust against network failures.

  • Validate the download against the `Content-Length` header to confirm the file size is as expected and catch incomplete downloads.

  • Set a timeout in `requests.get()` to avoid hanging indefinitely if the server does not respond or is too slow.

  • Use `os.path` to dynamically set the file path and name, ensuring compatibility across different operating systems.

# Good Example: Handling exceptions with requests.get()
try:
    response = requests.get(url, timeout=10) # Set timeout
    response.raise_for_status() # Check for HTTP errors
except requests.exceptions.RequestException as e:
    print(f"Error downloading file: {e}")

# Bad Example: No exception handling or timeout
response = requests.get(url)
with open("sample.pdf", "wb") as file:
    file.write(response.content)

# Good Example: Validate 'content-length' before saving
response = requests.get(url, stream=True)
content_length = response.headers.get('content-length')
if content_length and len(response.content) == int(content_length):
    with open("validated_file.pdf", "wb") as file:
        file.write(response.content)
else:
    print("Content length mismatch.")

# Bad Example: Ignoring 'content-length' validation
response = requests.get(url)
with open("unvalidated_file.pdf", "wb") as file:
    file.write(response.content)
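
For large files, reading `response.content` just to compare its length defeats the point of streaming. One alternative is to count bytes as they are written and compare the total against the `Content-Length` header afterwards; the filename below is illustrative, and the header may not match the written size if the server compresses the response.

# Streaming variant: count bytes while writing, then compare to Content-Length
response = requests.get(url, stream=True, timeout=10)
expected = response.headers.get('content-length')
written = 0
with open("validated_stream.pdf", "wb") as file:
    for chunk in response.iter_content(chunk_size=8192):
        written += len(chunk)
        file.write(chunk)
if expected and written != int(expected):
    print("Content length mismatch: the download may be incomplete.")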

# Good Example: Using os.path for file paths
import os

filename = os.path.join(os.getcwd(), "downloaded_file.pdf")
response = requests.get(url)
with open(filename, "wb") as file:
    file.write(response.content)

# Bad Example: Hardcoding file paths
response = requests.get(url)
with open("/absolute/path/downloaded_file.pdf", "wb") as file:
    file.write(response.content)

# Good Example: Setting a timeout in requests.get()
try:
    response = requests.get(url, timeout=5) # Timeout after 5 seconds
    with open("timed_file.pdf", "wb") as file:
        file.write(response.content)
except requests.Timeout:
    print("The request timed out.")

# Bad Example: No timeout set
response = requests.get(url)
with open("no_timeout_file.pdf", "wb") as file:
    file.write(response.content)
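
Putting the common-issue fixes together, a small helper can combine the timeout, status check, streaming, and portable path handling shown above. This is a minimal sketch rather than a production utility; the function name, defaults, and filename derivation are illustrative.

import os
import requests

def download_file(url, directory=None, timeout=10):
    """Download url into directory (defaults to the current working directory)."""
    directory = directory or os.getcwd()
    # Naive filename derivation; adjust for URLs with query strings
    filename = os.path.join(directory, os.path.basename(url) or "download.bin")
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(filename, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        return None
    return filename

download_file("https://sandbox.oxylabs.io/products/sample.pdf")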
