delete documents
Browse files
App/Embedding/utils/Initialize.py
CHANGED
@@ -1,24 +1,73 @@
|
|
1 |
from langchain.embeddings import HuggingFaceEmbeddings
|
2 |
from langchain.docstore.document import Document
|
3 |
-
|
4 |
-
import
|
5 |
-
|
|
|
6 |
|
7 |
-
index_name = "movie-recommender-fast"
|
8 |
-
model_name = "thenlper/gte-base"
|
9 |
-
embeddings = HuggingFaceEmbeddings(model_name=model_name)
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
-
# get api key from app.pinecone.io
|
13 |
-
# PINECONE_API_KEY = (
|
14 |
-
# os.environ.get("PINECONE_API_KEY") or "0712a5e4-bcf3-4152-a726-27ee3a2676bb"
|
15 |
-
# )
|
16 |
-
# # find your environment next to the api key in pinecone console
|
17 |
-
# PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT") or "us-west4-gcp-free"
|
18 |
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
|
24 |
def generateChunks(chunks, task_id, n=100):
|
@@ -53,6 +102,7 @@ def search(query: str, task_id: str):
|
|
53 |
embedding=embeddings,
|
54 |
index_name="test_embedding",
|
55 |
)
|
|
|
56 |
data = vectorstore.similarity_search(
|
57 |
query=query,
|
58 |
pre_filter={"text": {"path": "task_id", "query": task_id}},
|
@@ -65,10 +115,7 @@ def search(query: str, task_id: str):
|
|
65 |
# data =[d.dict() for d in data]
|
66 |
# print(data[0].metadata.exclude({'_id','embedding'}))
|
67 |
# pprint.pprint(data[0].metadata)
|
68 |
-
return [
|
69 |
-
{"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
|
70 |
-
for d in data
|
71 |
-
]
|
72 |
# agent =vectorstore.as_retriever(
|
73 |
|
74 |
# )
|
@@ -87,4 +134,4 @@ def encode(temp: list[Document]):
|
|
87 |
vectorstore.from_documents(
|
88 |
temp, embedding=embeddings, collection=collection, index_name=index_name
|
89 |
)
|
90 |
-
# return embeddings.embed_documents(texts = [d.page_content for d in temp])
|
|
|
1 |
import json
import os
import pprint

import aiohttp
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from motor.motor_asyncio import AsyncIOMotorClient
|
7 |
|
|
|
|
|
|
|
8 |
|
9 |
+
# Runtime configuration, read once at import time (None when unset).
completion_base = os.environ.get("completion_base")  # chat-completions endpoint URL
openai_api_key = os.environ.get("openai_api_key")  # bearer token for that endpoint
mongoDB = os.environ.get("MONGO_DB")  # MongoDB connection string
# Prompt template consumed by fetch_data(); filled via str.format with
# the retrieved context and the user's question.
template = """### Given the following context
### Context
{context}
### Use it to explain the question: {question}
"""
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
+
async def fetch_data(question, context):
    """Ask the chat-completions endpoint to explain *question* using *context*.

    Builds an OpenAI-style chat payload from the module-level ``template``,
    POSTs it to ``completion_base`` and returns the assistant's reply text.

    Args:
        question: The user's question to be explained.
        context: Retrieved context text interpolated into the prompt.

    Returns:
        str: The content of the first choice in the API response.

    Raises:
        KeyError: if the response body has no ``choices`` (e.g. an API error).
        aiohttp.ClientError: on connection/transport failures.
    """
    url = completion_base

    payload = json.dumps(
        {
            "messages": [
                {
                    "role": "system",
                    "content": "### You provide explanations based on the provided context",
                },
                {
                    "role": "user",
                    "content": template.format(context=context, question=question),
                },
            ],
            "model": "gpt-3.5-turbo",
            "temperature": 1,
            "presence_penalty": 0,
            "top_p": 0.95,
            "frequency_penalty": 0,
            "stream": False,
        }
    )

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}",
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, data=payload) as response:
            # Parse into a new name instead of rebinding `response`,
            # which shadowed the aiohttp response object in the original.
            body = await response.json()
            return body["choices"][0]["message"]["content"]
|
52 |
+
|
53 |
|
54 |
+
async def delete_documents(task_id):
    """Delete every stored document for *task_id* from transcriptions.videos.

    Opens a short-lived Motor client against the ``mongoDB`` connection
    string, removes all matching documents, and reports how many were
    deleted.

    Args:
        task_id: Identifier matched against the documents' ``task_id`` field.

    Returns:
        int: number of documents deleted (the original only printed it;
        returning the count is backward-compatible for callers that
        ignore the result).
    """
    client = AsyncIOMotorClient(mongoDB)
    try:
        collection = client["transcriptions"]["videos"]
        result = await collection.delete_many({"task_id": task_id})
        print(f"Deleted {result.deleted_count} document(s)")
        return result.deleted_count
    finally:
        # The original never closed the client, leaking a connection
        # pool on every call.
        client.close()
|
61 |
+
|
62 |
+
|
63 |
+
# mongo_client = MongoClient(
|
64 |
+
# mongoDB
|
65 |
+
# )
|
66 |
+
# model_name = "BAAI/bge-base-en"
|
67 |
+
# collection = mongo_client["transcriptions"]["videos"]
|
68 |
+
# embeddings = HuggingFaceEmbeddings(model_name=model_name)
|
69 |
+
# index_name = "test_embeddings"
|
70 |
+
# vectorstore = MongoDBAtlasVectorSearch(collection, embeddings, index_name=index_name)
|
71 |
|
72 |
|
73 |
def generateChunks(chunks, task_id, n=100):
|
|
|
102 |
embedding=embeddings,
|
103 |
index_name="test_embedding",
|
104 |
)
|
105 |
+
|
106 |
data = vectorstore.similarity_search(
|
107 |
query=query,
|
108 |
pre_filter={"text": {"path": "task_id", "query": task_id}},
|
|
|
115 |
# data =[d.dict() for d in data]
|
116 |
# print(data[0].metadata.exclude({'_id','embedding'}))
|
117 |
# pprint.pprint(data[0].metadata)
|
118 |
+
return [{"text": d.page_content,'start':d.metadata['start'],"end":d.metadata['end']} for d in data]
|
|
|
|
|
|
|
119 |
# agent =vectorstore.as_retriever(
|
120 |
|
121 |
# )
|
|
|
134 |
vectorstore.from_documents(
|
135 |
temp, embedding=embeddings, collection=collection, index_name=index_name
|
136 |
)
|
137 |
+
# return embeddings.embed_documents(texts = [d.page_content for d in temp])
|
App/Transcription/TranscriptionRoutes.py
CHANGED
@@ -20,7 +20,7 @@ from .Model import Transcriptions
|
|
20 |
from .Utils.fastapi_tasks import perform_background_task
|
21 |
import yt_dlp
|
22 |
from fastapi_jwt_auth import AuthJWT
|
23 |
-
from App.Embedding.utils.Initialize import delete_documents
|
24 |
|
25 |
# from .Model import User
|
26 |
# from sqlalchemy import and_
|
@@ -28,11 +28,13 @@ from App.Embedding.utils.Initialize import delete_documents
|
|
28 |
|
29 |
transcription_router = APIRouter(tags=["Transcription"])
|
30 |
|
|
|
31 |
def genUUID():
|
32 |
uuid_value = uuid.uuid4()
|
33 |
short_uuid = str(uuid_value)[:6]
|
34 |
return short_uuid
|
35 |
|
|
|
36 |
@transcription_router.get("/download-audio")
|
37 |
async def download_audio(
|
38 |
url: str,
|
@@ -43,7 +45,6 @@ async def download_audio(
|
|
43 |
),
|
44 |
user: UserSchema = Depends(get_token_owner),
|
45 |
):
|
46 |
-
|
47 |
youtube_url = url
|
48 |
parsed_url = urlparse(youtube_url)
|
49 |
|
@@ -78,7 +79,7 @@ async def download_audio(
|
|
78 |
}
|
79 |
|
80 |
task = downloadfile.delay(url=url, ydl_opts=ydl_opts, model_size=model)
|
81 |
-
response = {"task_id": task.id, "file_name": video_title
|
82 |
transcription_enrty = await Transcriptions.objects.create(
|
83 |
user=user, youtubeLink=url, **response
|
84 |
)
|
@@ -111,7 +112,7 @@ async def delete_transcription(
|
|
111 |
await transcript.delete()
|
112 |
task = AsyncResult(task_id)
|
113 |
task.revoke(terminate=True)
|
114 |
-
await delete_documents(task_id=task_id)
|
115 |
return {"code": 200, "message": f"deleted {task_id}", "payload": None}
|
116 |
else:
|
117 |
return {
|
@@ -132,8 +133,8 @@ async def create_file(
|
|
132 |
),
|
133 |
user: UserSchema = Depends(get_token_owner),
|
134 |
):
|
135 |
-
extension = file.filename.split(
|
136 |
-
file_name = f
|
137 |
# Write the file to disk asynchronously
|
138 |
Upload_dir = ""
|
139 |
try:
|
@@ -155,7 +156,9 @@ async def create_file(
|
|
155 |
transcription_enrty = await Transcriptions.objects.create(
|
156 |
task_id=task.id, user=user, file_name=file_name
|
157 |
)
|
158 |
-
background_tasks.add_task(
|
|
|
|
|
159 |
return {
|
160 |
"file_size": file.size,
|
161 |
"file_name": file.filename,
|
|
|
20 |
from .Utils.fastapi_tasks import perform_background_task
|
21 |
import yt_dlp
|
22 |
from fastapi_jwt_auth import AuthJWT
|
23 |
+
# from App.Embedding.utils.Initialize import delete_documents
|
24 |
|
25 |
# from .Model import User
|
26 |
# from sqlalchemy import and_
|
|
|
28 |
|
29 |
transcription_router = APIRouter(tags=["Transcription"])
|
30 |
|
31 |
+
|
32 |
def genUUID():
    """Return a short 6-character identifier derived from a random UUID4."""
    return str(uuid.uuid4())[:6]
|
36 |
|
37 |
+
|
38 |
@transcription_router.get("/download-audio")
|
39 |
async def download_audio(
|
40 |
url: str,
|
|
|
45 |
),
|
46 |
user: UserSchema = Depends(get_token_owner),
|
47 |
):
|
|
|
48 |
youtube_url = url
|
49 |
parsed_url = urlparse(youtube_url)
|
50 |
|
|
|
79 |
}
|
80 |
|
81 |
task = downloadfile.delay(url=url, ydl_opts=ydl_opts, model_size=model)
|
82 |
+
response = {"task_id": task.id, "file_name": video_title}
|
83 |
transcription_enrty = await Transcriptions.objects.create(
|
84 |
user=user, youtubeLink=url, **response
|
85 |
)
|
|
|
112 |
await transcript.delete()
|
113 |
task = AsyncResult(task_id)
|
114 |
task.revoke(terminate=True)
|
115 |
+
# await delete_documents(task_id=task_id)
|
116 |
return {"code": 200, "message": f"deleted {task_id}", "payload": None}
|
117 |
else:
|
118 |
return {
|
|
|
133 |
),
|
134 |
user: UserSchema = Depends(get_token_owner),
|
135 |
):
|
136 |
+
extension = file.filename.split(".")[-1]
|
137 |
+
file_name = f"{genUUID()}.{extension}"
|
138 |
# Write the file to disk asynchronously
|
139 |
Upload_dir = ""
|
140 |
try:
|
|
|
156 |
transcription_enrty = await Transcriptions.objects.create(
|
157 |
task_id=task.id, user=user, file_name=file_name
|
158 |
)
|
159 |
+
background_tasks.add_task(
|
160 |
+
perform_background_task, file_name, file=file, task_id=task.id
|
161 |
+
)
|
162 |
return {
|
163 |
"file_size": file.size,
|
164 |
"file_name": file.filename,
|