import pandas as pd dataset_1= pd.read_csv("training_data.csv") #dataset_1 #for i in dataset_1.columns: #print(i) # Create a new column with merged column names where value is 1 dataset_1['symptoms_text'] = dataset_1.apply(lambda row: ','.join([col for col in dataset_1.columns if row[col] == 1]), axis=1) #print("Original DataFrame:") #print(dataset_1) #dataset_1.to_csv("training_data_after_changes.csv") final_dataset = pd.DataFrame(dataset_1[["prognosis","symptoms_text"]]) final_dataset.columns = ['label', 'text'] #final_dataset.to_csv("final_dataset.csv") #final_dataset ##############3 import pandas as pd dataset_2= pd.read_csv("Symptom2Disease.csv") dataset_2 = dataset_2[["label","text"]] #dataset_2 ################# df_combined = pd.concat([final_dataset, dataset_2], axis=0, ignore_index=True) #df_combined ################ import nltk nltk.download('stopwords') import pandas as pd import re import string from nltk.tokenize import word_tokenize from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer # Download necessary NLTK data files nltk.download('punkt') nltk.download('stopwords') nltk.download('wordnet') def preprocess_text(text): # Convert to lowercase text = text.lower() cleaned_text = re.sub(r'[^a-zA-Z0-9\s\,]', ' ', text) # Tokenize text tokens = word_tokenize(cleaned_text) # Remove stop words stop_words = set(stopwords.words('english')) tokens = [word for word in tokens if word not in stop_words] # Rejoin tokens into a single string cleaned_text = ' '.join(tokens) return cleaned_text df_combined["cleaned_text"] = df_combined["text"].apply(preprocess_text) #print(df_combined) ########### #df_combined.to_csv("final_dataset_llms.csv") ########### import pandas as pd from sklearn.feature_extraction.text import CountVectorizer print("scikit-learn imported successfully!") from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score, classification_report # Load your dataset data = pd.read_csv('final_dataset_llms.csv') # Replace with your file path # Example columns: 'symptoms' and 'label' X = data['cleaned_text'] y = data['label'] # Convert text data to numerical data vectorizer = CountVectorizer() X_vectorized = vectorizer.fit_transform(X) # Split the data into training and testing sets X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42) # Train the model model = LogisticRegression() model.fit(X_train, y_train) # Make predictions y_pred = model.predict(X_test) # Evaluate the model accuracy = accuracy_score(y_test, y_pred) print(f'Accuracy: {accuracy:.2f}') print('Classification Report:') print(classification_report(y_test, y_pred)) ########################pip ######################### ############################### ########################################### data['label'].nunique() ############################################# def precaution(label): dataset_precau = pd.read_csv("disease_precaution.csv", encoding='latin1') label = str(label) label = label.lower() dataset_precau["Disease"] = dataset_precau["Disease"].str.lower() # Filter the DataFrame for the given label filtered_precautions = dataset_precau[dataset_precau["Disease"] == label] # Check if any precautions were found if not filtered_precautions.empty: # Extract precaution columns precautions = filtered_precautions[["Precaution_1", "Precaution_2", "Precaution_3", "Precaution_4"]] return precautions.values.tolist() # Convert DataFrame to a list of lists else: return [] # Return an empty list if no matching label is found def occurance(label): dataset_occur = pd.read_csv("disease_riskFactors.csv", encoding='latin1') label = str(label) label = label.lower() dataset_occur["DNAME"] = dataset_occur["DNAME"].str.lower() # Filter the DataFrame for the given label filtered_occurrence = dataset_occur[dataset_occur["DNAME"] == label] # Check if any occurrences were found if not filtered_occurrence.empty: occurrences = filtered_occurrence["OCCUR"].tolist() # Convert Series to list return occurrences else: return [] # Return an empty list if no matching label is found ################################################################################ import streamlit as st import numpy as np import sklearn from sklearn.feature_extraction.text import CountVectorizer st.title("SYMPTOMS DETECTION, PRECAUTION n OCCURANCE") symptoms = st.text_area("Enter your symptoms (comma-separated):") if symptoms.lower() != "exit": # Convert input string to a list of symptoms # Function to predict new symptoms def predict_symptoms(new_symptoms): preprocessed_text = preprocess_text(new_symptoms) if isinstance(preprocessed_text, str): new_symptoms = [preprocessed_text] # Vectorize the new symptoms new_symptoms_vectorized = vectorizer.transform(new_symptoms) # Make predictions prediction = model.predict(new_symptoms_vectorized) return prediction st.write("disease :") symptoms_list = [symptom.strip() for symptom in symptoms.split(',')] # Predict symptoms prediction = predict_symptoms(' '.join(symptoms_list)) st.write(prediction[0]) # Extract the string from the numpy array # Display precautions st.write("Precautions:") precautions = precaution(prediction[0]) # Pass the string, not the array if precautions: for precaution_list in precautions: for precaution_item in precaution_list: if precaution_item: # Check if the item is not None or empty st.write(f"- {precaution_item}") else: st.write("No precautions found for this disease.") # Display occurrences st.write("Occurrence:") occurrences = occurance(prediction[0]) # Pass the string, not the array if occurrences: for occurrence in occurrences: st.write(f"- {occurrence}") else: st.write("No occurrence information found for this disease.") else: st.write("Please enter symptoms to get the disease.") # Get user input # Make a prediction