Spaces:
Sleeping
Sleeping
File size: 7,791 Bytes
ce4a4e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
import os
import pandas as pd
#from llama_index.llms.replicate import Replicate
import requests
import re
def extract_japan_cities(text):
    """Extract unique Japanese city names from a "City - Country" listing.

    Parameters
    ----------
    text : str
        Comma-separated "City - Country" pairs (as built by the fetch
        functions in this file), e.g. ``"Tokyo - Japan, Paris - France"``.

    Returns
    -------
    str
        Sorted, comma-separated unique city names whose country is Japan;
        empty string when none are found.
    """
    # Capture the comma-free span immediately before " - Japan".  Unlike a
    # \w-based pattern this keeps hyphenated and multi-word names intact
    # (e.g. "Osaka-shi"); the trailing \b stops "Japanese" from matching.
    pattern = r'([^,]+?) - Japan\b'
    # A set comprehension deduplicates; strip() drops the leading space left
    # by the ", " separator between pairs.
    unique_cities = {city.strip() for city in re.findall(pattern, text)}
    return ', '.join(sorted(unique_cities))
def fetch_clinical_trials(cancer_name):
    """Fetch recruiting clinical trials for *cancer_name* at Japanese sites.

    Queries the ClinicalTrials.gov v2 ``/studies`` API and follows
    ``nextPageToken`` pagination until every result page is consumed.

    Parameters
    ----------
    cancer_name : str
        Condition name inserted into the search expression.

    Returns
    -------
    pandas.DataFrame
        One row per study with English column names (NCTID, Title,
        Primary Completion Date, Cancer, Summary, Japanes Locations,
        Eligibility Criteria).  Empty when the request fails or nothing
        matches.
    """
    # Restrict to sites located in Japan that are currently recruiting.
    search_expr = (
        f"{cancer_name} SEARCH[Location]"
        "(AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"
    )
    base_url = "https://clinicaltrials.gov/api/v2/studies"
    params = {
        "query.titles": search_expr,
        "pageSize": 100
    }
    data_list = []

    # Loop until the API stops returning a nextPageToken.
    while True:
        # Debug aid: show the effective request URL.
        print("Fetching data from:", base_url + '?' + '&'.join(f"{k}={v}" for k, v in params.items()))
        # Timeout prevents a stalled connection from hanging the loop forever.
        response = requests.get(base_url, params=params, timeout=30)
        if response.status_code != 200:
            print("Failed to fetch data. Status code:", response.status_code)
            break

        data = response.json()
        for study in data.get('studies', []):
            # Chained .get() guards against studies that lack an optional
            # module (the original indexed some modules directly, which
            # raised KeyError on such studies).
            protocol = study.get('protocolSection', {})
            identification = protocol.get('identificationModule', {})

            nctId = identification.get('nctId', 'Unknown')
            title = identification.get('briefTitle', 'no title')
            conditions = ', '.join(protocol.get('conditionsModule', {}).get('conditions', ['No conditions listed']))
            summary = protocol.get('descriptionModule', {}).get('briefSummary', 'no summary')
            primaryCompletionDate = protocol.get('statusModule', {}).get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date')
            eligibilityCriteria = protocol.get('eligibilityModule', {}).get('eligibilityCriteria', 'Unknown')

            # Build a "City - Country" listing, then keep only Japanese cities.
            locations_list = protocol.get('contactsLocationsModule', {}).get('locations', [])
            locations = ', '.join(
                f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}"
                for location in locations_list
            ) if locations_list else "No locations listed"
            JapanesLocations = extract_japan_cities(locations)

            data_list.append({
                "NCTID": nctId,
                "Title": title,
                "Primary Completion Date": primaryCompletionDate,
                "Cancer": conditions,
                "Summary": summary,
                # NOTE(review): "Japanes" typo kept deliberately — it is a
                # public column name downstream code may depend on.
                "Japanes Locations": JapanesLocations,
                "Eligibility Criteria": eligibilityCriteria
            })

        # Advance to the next page, or stop when pagination is exhausted.
        nextPageToken = data.get('nextPageToken')
        if not nextPageToken:
            break
        params['pageToken'] = nextPageToken

    return pd.DataFrame(data_list)
def fetch_clinical_trials_jp(cancer_name):
    """Fetch recruiting Japanese clinical trials, with Japanese column names.

    Same query and pagination as ``fetch_clinical_trials`` (ClinicalTrials.gov
    v2 ``/studies`` API, follows ``nextPageToken`` until exhausted), but the
    resulting DataFrame uses Japanese column labels.

    Parameters
    ----------
    cancer_name : str
        Condition name inserted into the search expression.

    Returns
    -------
    pandas.DataFrame
        One row per study; empty when the request fails or nothing matches.
    """
    # Restrict to sites located in Japan that are currently recruiting.
    search_expr = (
        f"{cancer_name} SEARCH[Location]"
        "(AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"
    )
    base_url = "https://clinicaltrials.gov/api/v2/studies"
    params = {
        "query.titles": search_expr,
        "pageSize": 100
    }
    data_list = []

    # Loop until the API stops returning a nextPageToken.
    while True:
        # Debug aid: show the effective request URL.
        print("Fetching data from:", base_url + '?' + '&'.join(f"{k}={v}" for k, v in params.items()))
        # Timeout prevents a stalled connection from hanging the loop forever.
        response = requests.get(base_url, params=params, timeout=30)
        if response.status_code != 200:
            print("Failed to fetch data. Status code:", response.status_code)
            break

        data = response.json()
        for study in data.get('studies', []):
            # Chained .get() guards against studies that lack an optional
            # module (the original indexed some modules directly, which
            # raised KeyError on such studies).
            protocol = study.get('protocolSection', {})
            identification = protocol.get('identificationModule', {})

            nctId = identification.get('nctId', 'Unknown')
            title = identification.get('briefTitle', 'no title')
            conditions = ', '.join(protocol.get('conditionsModule', {}).get('conditions', ['No conditions listed']))
            summary = protocol.get('descriptionModule', {}).get('briefSummary', 'no summary')
            eligibilityCriteria = protocol.get('eligibilityModule', {}).get('eligibilityCriteria', 'Unknown')

            # Build a "City - Country" listing, then keep only Japanese cities.
            locations_list = protocol.get('contactsLocationsModule', {}).get('locations', [])
            locations = ', '.join(
                f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}"
                for location in locations_list
            ) if locations_list else "No locations listed"
            JapanesLocations = extract_japan_cities(locations)

            # Japanese column labels are public output — kept byte-identical.
            data_list.append({
                "NCTID": nctId,
                "γΏγ€γγ«": title,
                "対豑γ¨γͺγη": conditions,
                "γ΅γγͺγΌ": summary,
                "ε ΄ζ": JapanesLocations,
                "γ―γ©γ€γγͺγ’": eligibilityCriteria
            })

        # Advance to the next page, or stop when pagination is exhausted.
        nextPageToken = data.get('nextPageToken')
        if not nextPageToken:
            break
        params['pageToken'] = nextPageToken

    return pd.DataFrame(data_list)
|