|
import os
|
|
import re
|
|
import pandas as pd
|
|
import shutil
|
|
|
|
|
|
def search_biomodels(directory, keywords, output_file):
    """Find biomodels in *directory* whose declared name matches *keywords*.

    Every regular file directly inside ``directory`` is read as text and
    searched for:

    * identifiers of the form ``biomodels.db/<id>``, and
    * a name declared as ``<keywords[0]> is "<name>"`` (case-insensitive).

    When such a name exists and contains any of the keywords
    (case-insensitive substring test), each identifier found in that file is
    recorded together with the name.

    The result is written to ``output_file`` as CSV with the columns
    ``Biomodel Number`` and ``Biomodel Name``. The CSV is written even when
    nothing matched (it then holds only the header row).

    Args:
        directory: Folder whose files are scanned (sub-folders are skipped).
        keywords: Sequence of keyword strings. The first one is also used to
            build the name-declaration pattern. An empty sequence matches
            nothing.
        output_file: Path of the CSV file to create or overwrite.

    Raises:
        OSError: If ``directory`` cannot be listed or ``output_file`` cannot
            be written. Files that cannot be read are reported on stdout
            and skipped.
    """
    biomodel_numbers_list = []
    matching_biomodels = []

    # Sorted so the CSV row order does not depend on the file system.
    entries = sorted(os.listdir(directory))

    if keywords:
        # Loop invariants: build the patterns and lowered keywords once.
        number_pattern = re.compile(r'biomodels\.db/(\w+)')
        name_pattern = re.compile(
            rf'{re.escape(keywords[0])} is "([^"]+)"', re.IGNORECASE
        )
        lowered_keywords = [keyword.lower() for keyword in keywords]
    else:
        # No keyword means no name pattern can be built, so nothing can match.
        entries = []

    for file in entries:
        file_path = os.path.join(directory, file)

        # Directories cannot be opened as text; skip them quietly.
        if not os.path.isfile(file_path):
            continue

        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                file_content = f.read()
        except OSError as e:
            print(f"Error processing file {file_path}: {e}")
            continue

        name_match = name_pattern.search(file_content)
        if name_match is None:
            continue

        biomodel_name = name_match.group(1)
        lowered_name = biomodel_name.lower()
        if not any(keyword in lowered_name for keyword in lowered_keywords):
            continue

        biomodel_numbers = number_pattern.findall(file_content)
        biomodel_numbers_list.extend(biomodel_numbers)
        # One name entry per identifier keeps both columns the same length.
        matching_biomodels.extend([biomodel_name] * len(biomodel_numbers))

    df = pd.DataFrame({
        'Biomodel Number': biomodel_numbers_list,
        'Biomodel Name': matching_biomodels,
    })

    df.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")
|
|
|
|
|
|
def copy_matching_files(csv_file, data_folder, final_models_folder):
    """Copy files under *data_folder* that mention a biomodel from *csv_file*.

    ``csv_file`` must provide the columns ``Biomodel Number`` and
    ``Biomodel Name``. Every file below ``data_folder`` (walked recursively)
    is read as text and copied into ``final_models_folder`` when its content
    contains any listed number (exact substring) or any listed name
    (case-insensitive substring). Each matching file is copied once, no
    matter how many CSV rows it matches.

    Copies are placed flat in ``final_models_folder``, so files with the
    same base name from different sub-folders overwrite one another.

    Args:
        csv_file: Path of the CSV listing biomodel numbers and names.
        data_folder: Root folder that is searched recursively.
        final_models_folder: Destination folder; created if missing.

    Raises:
        KeyError: If ``csv_file`` lacks one of the expected columns.
        OSError: If the CSV cannot be read or a copy fails. Source files
            that cannot be read are reported on stdout and skipped.
    """
    os.makedirs(final_models_folder, exist_ok=True)

    # Read everything as text so ids are never turned into ints or NaN.
    df = pd.read_csv(csv_file, dtype=str, keep_default_na=False)

    # Empty cells are dropped: an empty string is a substring of every file.
    wanted_numbers = {number for number in df['Biomodel Number'] if number}
    wanted_names = {name.lower() for name in df['Biomodel Name'] if name}

    destination = os.path.normcase(os.path.abspath(final_models_folder))

    for root, dirs, files in os.walk(data_folder):
        # Do not descend into the destination when it lives inside the data
        # folder; otherwise earlier copies would be scanned and re-copied.
        dirs[:] = [
            name for name in dirs
            if os.path.normcase(os.path.abspath(os.path.join(root, name)))
            != destination
        ]

        for file in files:
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
            except OSError as e:
                print(f"Error processing file {file_path}: {e}")
                continue

            lowered_content = content.lower()
            mentions_number = any(number in content for number in wanted_numbers)
            mentions_name = any(name in lowered_content for name in wanted_names)

            if mentions_number or mentions_name:
                shutil.copy(file_path, final_models_folder)
                print(f"Copied: {file} to final_models")

    print(f"All matching biomodel files have been copied to {final_models_folder}")
|
|
|
|
|
|
# Default locations used when this file is run as a script.
directory = r'C:\Users\navan\Downloads\BioModelsRAG\BioModelsRAG\data'
output_file = r'C:\Users\navan\Downloads\BioModelsRAG\biomodels_output.csv'
final_models_folder = r'C:\Users\navan\Downloads\BioModelsRAG\final_models'

# Guarded so that importing this module neither prompts for input nor
# touches the file system; running it as a script behaves as before.
if __name__ == "__main__":
    # Whitespace-separated words typed by the user become the keyword list.
    user_keywords = input("Keyword you would like to search for: ").split()

    # Step 1: write the CSV of matching biomodels.
    search_biomodels(directory, user_keywords, output_file)

    # Step 2: copy every file that mentions one of those biomodels.
    copy_matching_files(output_file, directory, final_models_folder)
|
|
|