TheBobBob
/

BioModelsRAG

Model card Files Files and versions Community

BioModelsRAG / selectBioModels.py

TheBobBob's picture

Upload core files

03a7adf verified 9 months ago

3.75 kB

	import os
	import re
	import pandas as pd
	import shutil

	# Function to search BioModels and create the CSV file
	def search_biomodels(directory, keywords, output_file):
	    """Search model files for keyword matches and write the hits to a CSV.

	    Each regular file directly inside ``directory`` is read as text. A file
	    contributes rows only when it contains a declaration of the form
	    ``<keywords[0]> is "<name>"`` (matched case-insensitively) and at least
	    one keyword occurs, case-insensitively, inside ``<name>``. One row is
	    then recorded for every ``biomodels.db/<id>`` reference in that file.

	    Args:
	        directory: Folder whose immediate entries are scanned.
	        keywords: Search terms; the first one anchors the name lookup.
	            An empty sequence yields a header-only CSV.
	        output_file: Path of the CSV to write. It has the columns
	            ``Biomodel Number`` and ``Biomodel Name``.

	    Files that cannot be read are reported on stdout and skipped.
	    """
	    biomodel_numbers_list = []
	    matching_biomodels = []

	    if keywords:
	        # Compile once: both patterns are loop-invariant.
	        name_pattern = re.compile(
	            rf'{re.escape(keywords[0])} is "([^"]+)"', re.IGNORECASE
	        )
	        number_pattern = re.compile(r'biomodels\.db/(\w+)')
	        lowered_keywords = [keyword.lower() for keyword in keywords]
	        # Sorted so the row order of the CSV is reproducible across runs.
	        file_names = sorted(os.listdir(directory))
	    else:
	        # Without a first keyword there is nothing to anchor the name
	        # lookup on, so no file can match.
	        file_names = []

	    for file_name in file_names:
	        file_path = os.path.join(directory, file_name)

	        # Sub-directories cannot be opened as text; skip them quietly.
	        if not os.path.isfile(file_path):
	            continue

	        try:
	            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
	                file_content = f.read()
	        except OSError as e:
	            print(f"Error processing file {file_path}: {e}")
	            continue

	        name_match = name_pattern.search(file_content)
	        if name_match is None:
	            continue

	        biomodel_name = name_match.group(1)
	        lowered_name = biomodel_name.lower()
	        if not any(keyword in lowered_name for keyword in lowered_keywords):
	            continue

	        # One row per referenced identifier, all sharing this file's name.
	        biomodel_numbers = number_pattern.findall(file_content)
	        biomodel_numbers_list.extend(biomodel_numbers)
	        matching_biomodels.extend([biomodel_name] * len(biomodel_numbers))

	    # Both lists grow in lockstep, so they can be used as columns directly.
	    df = pd.DataFrame({
	        'Biomodel Number': biomodel_numbers_list,
	        'Biomodel Name': matching_biomodels,
	    })

	    df.to_csv(output_file, index=False)
	    print(f"Data saved to {output_file}")

	# Function to copy matching files to final_models directory
	def copy_matching_files(csv_file, data_folder, final_models_folder):
	    """Copy files that mention a model listed in the CSV into one folder.

	    ``data_folder`` is walked recursively. A file is copied when its text
	    contains any ``Biomodel Name`` from the CSV (compared
	    case-insensitively) or any ``Biomodel Number`` (compared verbatim).

	    Args:
	        csv_file: CSV with ``Biomodel Number`` and ``Biomodel Name`` columns.
	        data_folder: Root folder that is searched recursively.
	        final_models_folder: Destination folder; created when missing.

	    Files that cannot be read are reported on stdout and skipped.
	    """
	    os.makedirs(final_models_folder, exist_ok=True)

	    # Read every cell as plain text. With type inference an all-digit
	    # identifier becomes an int and a name such as "NA" becomes NaN, and
	    # neither supports the substring tests below.
	    df = pd.read_csv(csv_file, dtype=str, keep_default_na=False)
	    targets = [
	        (number, name.lower())
	        for number, name in zip(df['Biomodel Number'], df['Biomodel Name'])
	    ]

	    for root, _dirs, files in os.walk(data_folder):
	        for file in files:
	            file_path = os.path.join(root, file)
	            try:
	                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
	                    content = f.read()
	            except OSError as e:
	                print(f"Error processing file {file_path}: {e}")
	                continue

	            # Lower-case once per file instead of once per CSV row.
	            lowered_content = content.lower()

	            # Empty cells are ignored: an empty string is a substring of
	            # every text and would otherwise match every file.
	            # any() stops at the first hit, so each file is copied once.
	            is_match = any(
	                bool(name and name in lowered_content)
	                or bool(number and number in content)
	                for number, name in targets
	            )
	            if is_match:
	                shutil.copy(file_path, final_models_folder)
	                print(f"Copied: {file} to final_models")

	    print(f"All matching biomodel files have been copied to {final_models_folder}")

	# Main execution
	# Default locations used when this file is run as a script. They stay at
	# module level so they remain importable by name.
	directory = r'C:\Users\navan\Downloads\BioModelsRAG\BioModelsRAG\data'
	output_file = r'C:\Users\navan\Downloads\BioModelsRAG\biomodels_output.csv'
	final_models_folder = r'C:\Users\navan\Downloads\BioModelsRAG\final_models'

	# Only prompt and run the pipeline when executed directly, so importing
	# this module for its functions does not block on input().
	if __name__ == "__main__":
	    # Whitespace-separated search terms typed by the user.
	    user_keywords = input("Keyword you would like to search for: ").split()

	    # Search and copy files
	    search_biomodels(directory, user_keywords, output_file)
	    copy_matching_files(output_file, directory, final_models_folder)