import os
import re

import pandas as pd
import requests

# from llama_index.llms.replicate import Replicate


def extract_japan_cities(text):
    """Return a sorted, comma-separated string of the unique city names
    that appear in *text* as "<City> - Japan" entries.

    Parameters
    ----------
    text : str
        A location string such as "Tokyo - Japan, Boston - United States".

    Returns
    -------
    str
        e.g. "Osaka, Tokyo" — empty string when no Japanese city matches.
    """
    # Capture the (up to two-word) city name preceding a literal " - Japan".
    pattern = r'(\b\w+\s*\w*\b) - Japan'
    cities = re.findall(pattern, text)
    # De-duplicate, then sort for a stable, readable result.
    unique_cities = sorted(set(cities))
    return ', '.join(unique_cities)


def _fetch_study_rows(cancer_name):
    """Query the ClinicalTrials.gov v2 API for recruiting studies located in
    Japan whose title matches *cancer_name*, and return one dict of extracted
    fields per study.

    Follows ``nextPageToken`` pagination until exhausted.  On a non-200
    response the loop stops and whatever was collected so far is returned
    (best-effort, matching the original behavior).
    """
    search_expr = (
        "%s SEARCH[Location](AREA[LocationCountry]Japan "
        "AND AREA[LocationStatus]Recruiting)" % cancer_name
    )
    base_url = "https://clinicaltrials.gov/api/v2/studies"
    params = {"query.titles": search_expr, "pageSize": 100}

    rows = []
    while True:
        # Debug aid: show the URL about to be fetched.
        print("Fetching data from:",
              base_url + '?' + '&'.join(f"{k}={v}" for k, v in params.items()))
        response = requests.get(base_url, params=params)
        if response.status_code != 200:
            print("Failed to fetch data. Status code:", response.status_code)
            break

        data = response.json()
        for study in data.get('studies', []):
            protocol = study.get('protocolSection', {})
            # Every module is fetched with .get(..., {}): several of them
            # (description, design, eligibility, contactsLocations, ...) are
            # optional in the API payload and may be absent for a study.
            # The original code subscripted some of these directly, which
            # raised KeyError on such studies.
            ident = protocol.get('identificationModule', {})
            status = protocol.get('statusModule', {})

            locations_list = (
                protocol.get('contactsLocationsModule', {}).get('locations', [])
            )
            if locations_list:
                locations = ', '.join(
                    f"{loc.get('city', 'No City')} - {loc.get('country', 'No Country')}"
                    for loc in locations_list
                )
            else:
                locations = "No locations listed"

            rows.append({
                "nctId": ident.get('nctId', 'Unknown'),
                "title": ident.get('briefTitle', 'no title'),
                "startDate": status.get('startDateStruct', {}).get('date', 'Unknown Date'),
                "primaryCompletionDate": status.get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date'),
                "conditions": ', '.join(
                    protocol.get('conditionsModule', {}).get('conditions', ['No conditions listed'])
                ),
                "summary": protocol.get('descriptionModule', {}).get('briefSummary', 'no summary'),
                "japanLocations": extract_japan_cities(locations),
                "phases": ', '.join(
                    protocol.get('designModule', {}).get('phases', ['Not Available'])
                ),
                "eligibilityCriteria": protocol.get('eligibilityModule', {}).get('eligibilityCriteria', 'Unknown'),
            })

        next_page_token = data.get('nextPageToken')
        if not next_page_token:
            break  # no more pages
        params['pageToken'] = next_page_token  # request the next page

    return rows


def fetch_clinical_trials(cancer_name):
    """Return a DataFrame (English column names) of recruiting Japanese
    clinical trials for *cancer_name*.

    Columns: NCTID, Title, Primary Completion Date, Cancer, Summary,
    Japanes Locations, Eligibility Criteria.
    """
    data_list = [
        {
            "NCTID": row["nctId"],
            "Title": row["title"],
            # "Start Date": row["startDate"],
            "Primary Completion Date": row["primaryCompletionDate"],
            "Cancer": row["conditions"],
            "Summary": row["summary"],
            # NOTE: "Japanes" typo kept intentionally — downstream code may
            # reference this exact column name.
            "Japanes Locations": row["japanLocations"],
            # "Phases": row["phases"],
            "Eligibility Criteria": row["eligibilityCriteria"],
        }
        for row in _fetch_study_rows(cancer_name)
    ]
    return pd.DataFrame(data_list)


def fetch_clinical_trials_jp(cancer_name):
    """Return a DataFrame (Japanese column names) of recruiting Japanese
    clinical trials for *cancer_name*.

    Columns: NCTID, タイトル, 対象となる癌, サマリー, 場所, クライテリア.
    """
    data_list = [
        {
            "NCTID": row["nctId"],
            "タイトル": row["title"],
            # "Start Date": row["startDate"],
            # "Primary Completion Date": row["primaryCompletionDate"],
            "対象となる癌": row["conditions"],
            "サマリー": row["summary"],
            "場所": row["japanLocations"],
            # "Phases": row["phases"],
            "クライテリア": row["eligibilityCriteria"],
        }
        for row in _fetch_study_rows(cancer_name)
    ]
    return pd.DataFrame(data_list)