# NovaScholar / file_upload_vectorize.py
# omkar-surve126's picture
# Upload 38 files
# b91146d verified
from pymongo import MongoClient
from datetime import datetime
import openai
import google.generativeai as genai
import streamlit as st
from db import courses_collection2, faculty_collection, students_collection, vectors_collection
from PIL import Image
import PyPDF2, docx, io
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document
from bson import ObjectId
from dotenv import load_dotenv
import os
from create_course import courses_collection
# Pull settings from a local .env file into the process environment before
# any of the keys below are read.
load_dotenv()
MONGO_URI = os.environ.get('MONGO_URI')
OPENAI_KEY = os.environ.get('OPENAI_KEY')
GEMINI_KEY = os.environ.get('GEMINI_KEY')

# MongoDB handles used by this module.
client = MongoClient(MONGO_URI)
db = client.get_database('novascholar_db')
resources_collection = db.get_collection('resources')

# Configure the OpenAI and Gemini SDKs with the keys read above.
openai.api_key = OPENAI_KEY
genai.configure(api_key=GEMINI_KEY)
model = genai.GenerativeModel('gemini-pro')
def upload_resource(course_id, session_id, file_name, file_content, material_type):
    """Store an uploaded file as a session resource and index its text.

    If a resource with the same ``session_id`` and ``file_name`` is already
    stored, its ``_id`` is returned and nothing else is done. Otherwise the
    file's text is extracted, the raw bytes and metadata are inserted into
    ``resources_collection``, the new id is pushed onto the matching
    session's ``pre_class.resources`` list in ``courses_collection`` and,
    when any text was extracted, ``create_vector_store`` is called for it.

    Args:
        course_id: Identifier of the course that owns the session.
        session_id: Identifier of the session the resource belongs to.
        file_name: Name stored with the resource; also used, together with
            ``session_id``, to detect duplicates.
        file_content: Uploaded file object. It must provide ``seek``,
            ``read``, ``getvalue`` and a ``type`` attribute.
        material_type: Caller-supplied label stored with the resource.

    Returns:
        The ``_id`` of the existing or newly inserted resource document.
    """
    # Check for a duplicate before doing anything expensive: parsing the
    # file is pointless when the resource is already stored.
    existing_resource = resources_collection.find_one(
        {"session_id": session_id, "file_name": file_name}
    )
    if existing_resource:
        return existing_resource["_id"]

    # May be None (extraction failed) or "" (nothing extracted).
    text_content = extract_text_from_file(file_content)

    # Rewind before reading so the stored bytes are the complete file.
    file_content.seek(0)
    original_file_content = file_content.read()

    resource_data = {
        "_id": ObjectId(),
        "course_id": course_id,
        "session_id": session_id,
        "file_name": file_name,
        "file_type": file_content.type,
        "text_content": text_content,
        "file_content": original_file_content,  # Store the original file content
        "material_type": material_type,
        "uploaded_at": datetime.utcnow(),
    }
    resources_collection.insert_one(resource_data)
    resource_id = resource_data["_id"]

    # Link the new resource to its session inside the course document.
    courses_collection.update_one(
        {"course_id": course_id, "sessions.session_id": session_id},
        {"$push": {"sessions.$.pre_class.resources": resource_id}},
    )

    # Only resources that yielded text get an embedding.
    if text_content:
        create_vector_store(text_content, resource_id)
    return resource_id
def assignment_submit(student_id, course_id, session_id, assignment_id, file_name, file_content, text_content, material_type):
    """Append a student's submission to an assignment in the course document.

    The whole file is read and stored, together with the supplied metadata,
    as a new entry in the ``submissions`` list of the assignment whose
    ``id`` equals ``assignment_id``, inside the session matched by
    ``session_id`` of the course matched by ``course_id``.

    Args:
        student_id: Identifier of the submitting student.
        course_id: Identifier of the course.
        session_id: Identifier of the session containing the assignment.
        assignment_id: Value of the assignment's ``id`` field.
        file_name: Name of the submitted file.
        file_content: Uploaded file object providing ``seek``, ``read`` and
            a ``type`` attribute.
        text_content: Text of the submission as supplied by the caller.
        material_type: Caller-supplied label stored with the submission.

    Returns:
        True if the submission was written to a course document; False if
        no document was modified or the database call raised.
    """
    # Rewind before reading so the stored bytes are the complete file.
    file_content.seek(0)
    original_file_content = file_content.read()

    assignment_data = {
        "student_id": student_id,
        "course_id": course_id,
        "session_id": session_id,
        "assignment_id": assignment_id,
        "file_name": file_name,
        "file_type": file_content.type,
        "file_content": original_file_content,  # Store the original file content
        "text_content": text_content,
        "material_type": material_type,
        "submitted_at": datetime.utcnow(),
        "file_url": "sample_url",
    }

    # NOTE(review): the filter puts two conditions on the ``sessions`` array
    # without $elemMatch; confirm the positional ``$`` resolves to the
    # intended session.
    try:
        result = courses_collection2.update_one(
            {
                "course_id": course_id,
                "sessions.session_id": session_id,
                "sessions.post_class.assignments.id": assignment_id,
            },
            {
                "$push": {
                    "sessions.$.post_class.assignments.$[assignment].submissions": assignment_data
                }
            },
            array_filters=[{"assignment.id": assignment_id}],
        )
        # update_one does not raise when the filter matches nothing, so a
        # clean return alone does not mean the submission was stored.
        if result.modified_count == 0:
            print("Error saving submission: no matching course/session/assignment was updated")
            return False
        return True
    except Exception as db_error:
        print(f"Error saving submission: {str(db_error)}")
        return False
def extract_text_from_file(uploaded_file):
    """Extract plain text from an uploaded TXT, PDF or DOCX file.

    The format is chosen from ``uploaded_file.type``:

    * ``text/plain`` -- the bytes are decoded as UTF-8.
    * ``application/pdf`` -- the text of every page, each followed by a
      newline.
    * ``application/vnd.openxmlformats-officedocument.wordprocessingml.document``
      -- the text of every paragraph, each followed by a newline.

    Args:
        uploaded_file: Object exposing a ``type`` attribute and a
            ``getvalue()`` method that returns the file's bytes.

    Returns:
        The extracted text, ``""`` when the type is none of the above, or
        ``None`` when processing raised (the error is shown via
        ``st.error``).
    """
    file_type = uploaded_file.type
    try:
        if file_type == "text/plain":
            return uploaded_file.getvalue().decode("utf-8")

        if file_type == "application/pdf":
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.getvalue()))
            # ``or ""`` is defensive: a page yielding no text (None) must not
            # abort extraction of the remaining pages.
            return "".join(
                f"{page.extract_text() or ''}\n" for page in pdf_reader.pages
            )

        if file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            doc = docx.Document(io.BytesIO(uploaded_file.getvalue()))
            return "".join(f"{para.text}\n" for para in doc.paragraphs)

        # Any other type: nothing to extract.
        return ""
    except Exception as e:
        st.error(f"Error processing file: {str(e)}")
        return None
def get_embedding(text):
    """Return the embedding of ``text`` from OpenAI's text-embedding-ada-002.

    Args:
        text: Input passed unchanged to the embeddings endpoint.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    request_args = {"model": "text-embedding-ada-002", "input": text}
    result = openai.embeddings.create(**request_args)
    first_item = result.data[0]
    return first_item.embedding
def create_vector_store(text, resource_id):
    """Embed ``text`` and store the vector for ``resource_id``.

    Nothing is stored when ``text`` is empty or when ``vectors_collection``
    already holds a document with the same ``resource_id`` and ``text``.
    Otherwise the embedding from ``get_embedding`` is inserted together with
    the text, the resource id and a creation timestamp.

    Args:
        text: Text to embed.
        resource_id: Identifier of the resource the text came from; stored
            as given.

    Returns:
        None.
    """
    # An empty input has nothing to embed; skip the embedding call entirely.
    if not text:
        print(f"No text to embed for Resource ID: {resource_id}")
        return

    existing_vector = vectors_collection.find_one(
        {"resource_id": resource_id, "text": text}
    )
    if existing_vector:
        print(f"Vector already exists for Resource ID: {resource_id}")
        return

    print(f"In Vector Store method, Resource ID is: {resource_id}")
    vectors_collection.insert_one(
        {
            "resource_id": resource_id,
            "vector": get_embedding(text),
            "text": text,
            "created_at": datetime.utcnow(),
        }
    )