PharmaBot / dataFetch.py
alperensn's picture
Upload files
25fcb73 verified
import requests
import json
import os
import math
# Define the API endpoint
API_URL = "https://api.fda.gov/drug/label.json"
# Define the output directory and file for all data
OUTPUT_DIR = "fda_data"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "drug_labels_all.json")
# The API's maximum limit per request is 1000
CHUNK_SIZE = 1000
MAX_RECORDS = 25000
def fetch_all_fda_data():
"""
Fetches drug label data from the openFDA API using pagination
and saves it to a single file.
"""
print("Starting to fetch data from the openFDA endpoint...")
try:
# Step 1: Make an initial request to get the total number of records
print("Determining the total number of records...")
initial_response = requests.get(API_URL, params={"limit": 1})
initial_response.raise_for_status()
total_records = initial_response.json()['meta']['results']['total']
records_to_fetch = min(total_records, MAX_RECORDS)
print(f"Found a total of {total_records} records. Fetching up to {records_to_fetch} records.")
all_results = []
# Step 2: Loop through the data in chunks
num_chunks = math.ceil(records_to_fetch / CHUNK_SIZE)
for i in range(num_chunks):
skip = i * CHUNK_SIZE
# Ensure we don't request more than records_to_fetch
limit = min(CHUNK_SIZE, records_to_fetch - skip)
if limit <= 0:
break
params = {"limit": limit, "skip": skip}
print(f"Fetching chunk {i+1}/{num_chunks} (records {skip} to {skip + limit - 1})...")
response = requests.get(API_URL, params=params)
response.raise_for_status()
chunk_data = response.json()
if 'results' in chunk_data:
all_results.extend(chunk_data['results'])
print("\nAll data has been fetched successfully.")
# Step 3: Save all the data to a single file
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
print(f"Created directory: {OUTPUT_DIR}")
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
json.dump({"results": all_results}, f, ensure_ascii=False, indent=4)
print(f"All {len(all_results)} records saved to: {OUTPUT_FILE}")
except requests.exceptions.HTTPError as http_err:
print(f"HTTP error occurred: {http_err}")
except requests.exceptions.RequestException as req_err:
print(f"An error occurred while fetching data: {req_err}")
except json.JSONDecodeError:
print("Failed to parse the response as JSON.")
except Exception as e:
print(f"An unexpected error occurred: {e}")
if __name__ == "__main__":
fetch_all_fda_data()