import os
import re
import shutil
from typing import Sequence

import pandas as pd

# Matches BioModels identifiers referenced in text as "biomodels.db/<id>".
_BIOMODEL_NUMBER_RE = re.compile(r'biomodels\.db/(\w+)')

# Column headers shared by the CSV writer and the CSV reader.
_NUMBER_COLUMN = 'Biomodel Number'
_NAME_COLUMN = 'Biomodel Name'


def _matches_keywords(name, keywords):
    """Return True if any keyword occurs in *name*, ignoring case."""
    lowered_name = name.lower()
    return any(keyword.lower() in lowered_name for keyword in keywords)


def _collect_matches(directory, keywords):
    """Return (biomodel number, biomodel name) rows for files directly in *directory*."""
    if not keywords:
        # The name pattern is built from the first keyword; without one
        # nothing can match.
        return []

    # NOTE(review): only the first keyword is used to locate the quoted name
    # (text of the form `<keyword> is "<name>"`); the remaining keywords are
    # only checked against the captured name. Confirm this is intended.
    name_pattern = re.compile(
        rf'{re.escape(keywords[0])} is "([^"]+)"', re.IGNORECASE
    )

    rows = []
    # Sorted so the CSV row order does not depend on the order os.listdir
    # happens to return.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be read as text; skip them quietly.
            continue

        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                file_content = f.read()
        except OSError as e:
            print(f"Error processing file {file_path}: {e}")
            continue

        name_match = name_pattern.search(file_content)
        if name_match is None:
            continue

        biomodel_name = name_match.group(1)
        if not _matches_keywords(biomodel_name, keywords):
            continue

        # One row per identifier found in the file, all sharing the file's name.
        rows.extend(
            (number, biomodel_name)
            for number in _BIOMODEL_NUMBER_RE.findall(file_content)
        )
    return rows


# Function to search BioModels and create the CSV file
def search_biomodels(directory: str, keywords: Sequence[str], output_file: str) -> None:
    """Search the files in *directory* for matching biomodels and write a CSV.

    A file contributes rows when it contains text of the form
    ``<first keyword> is "<name>"`` (case-insensitive) and the captured name
    contains at least one of *keywords* (case-insensitive). Each
    ``biomodels.db/<id>`` reference in such a file becomes one CSV row.

    Args:
        directory: Folder whose immediate files are scanned (not recursive).
        keywords: Search terms; an empty sequence yields a header-only CSV.
        output_file: Path of the CSV to write, with the columns
            ``Biomodel Number`` and ``Biomodel Name``.

    Raises:
        OSError: If *directory* cannot be listed or *output_file* cannot be
            written. Unreadable individual files are reported and skipped.
    """
    rows = _collect_matches(directory, keywords)

    # Create a DataFrame from the collected data
    df = pd.DataFrame(rows, columns=[_NUMBER_COLUMN, _NAME_COLUMN])

    # Save the DataFrame to a CSV file
    df.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")


# Function to copy matching files to final_models directory
def copy_matching_files(csv_file: str, data_folder: str, final_models_folder: str) -> None:
    """Copy every file under *data_folder* that mentions a biomodel from the CSV.

    A file matches when its text contains any ``Biomodel Name`` from the CSV
    (case-insensitive) or any ``Biomodel Number`` (case-sensitive). Matching
    files are copied flat into *final_models_folder*, so files with the same
    name in different sub-folders overwrite each other.

    Args:
        csv_file: CSV with ``Biomodel Number`` and ``Biomodel Name`` columns.
        data_folder: Folder that is walked recursively for candidate files.
        final_models_folder: Destination folder; created if missing.

    Raises:
        KeyError: If the CSV lacks one of the expected columns.
        OSError: If the destination cannot be created or a copy fails.
            Unreadable candidate files are reported and skipped.
    """
    # Create the final_models folder if it doesn't exist
    os.makedirs(final_models_folder, exist_ok=True)

    # Read every cell as text: identifiers made only of digits must stay
    # strings for the substring test, and names such as "NA" must not be
    # turned into NaN.
    df = pd.read_csv(csv_file, dtype=str, keep_default_na=False)

    # De-duplicate the needles and drop empty cells ("" is a substring of
    # everything and would match every file).
    names = {name.lower() for name in df[_NAME_COLUMN] if name}
    numbers = {number for number in df[_NUMBER_COLUMN] if number}

    destination = os.path.normcase(os.path.abspath(final_models_folder))

    # Iterate through the data folder to find and copy matching files
    for root, dirs, files in os.walk(data_folder):
        # Never descend into the destination: its files are our own copies and
        # copying a file onto itself raises shutil.SameFileError.
        dirs[:] = [
            d for d in dirs
            if os.path.normcase(os.path.abspath(os.path.join(root, d))) != destination
        ]
        if os.path.normcase(os.path.abspath(root)) == destination:
            continue

        for file in files:
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
            except OSError as e:
                print(f"Error processing file {file_path}: {e}")
                continue

            # Lower-case once per file instead of once per CSV row.
            lowered_content = content.lower()
            if any(name in lowered_content for name in names) or any(
                number in content for number in numbers
            ):
                # Copy (and report) each file once, however many rows match.
                shutil.copy(file_path, final_models_folder)
                print(f"Copied: {file} to final_models")

    print(f"All matching biomodel files have been copied to {final_models_folder}")


# Main execution
directory = r'C:\Users\navan\Downloads\BioModelsRAG\BioModelsRAG\data'
output_file = r'C:\Users\navan\Downloads\BioModelsRAG\biomodels_output.csv'
final_models_folder = r'C:\Users\navan\Downloads\BioModelsRAG\final_models'


def main() -> None:
    """Prompt for keywords, build the CSV, then copy the matching files."""
    user_keywords = input("Keyword you would like to search for: ").split()

    # Search and copy files
    search_biomodels(directory, user_keywords, output_file)
    copy_matching_files(output_file, directory, final_models_folder)


# Guarded so importing this module does not prompt for input or touch disk.
if __name__ == "__main__":
    main()