Mbonea committed
Commit 7fc5d88 · 1 Parent(s): 4649a3a

delete documents

App/Embedding/utils/Initialize.py CHANGED
@@ -1,24 +1,73 @@
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
-# from langchain.vectorstores import Pinecone
-import os
-# import pinecone
-
-index_name = "movie-recommender-fast"
-model_name = "thenlper/gte-base"
-embeddings = HuggingFaceEmbeddings(model_name=model_name)
-
-
-# get api key from app.pinecone.io
-# PINECONE_API_KEY = (
-#     os.environ.get("PINECONE_API_KEY") or "0712a5e4-bcf3-4152-a726-27ee3a2676bb"
-# )
-# # find your environment next to the api key in pinecone console
-# PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT") or "us-west4-gcp-free"
-
-# pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
-
-# docsearch = Pinecone.from_existing_index(index_name, embeddings)
+from langchain.vectorstores import MongoDBAtlasVectorSearch
+from pymongo import MongoClient
+from motor.motor_asyncio import AsyncIOMotorClient
+import os,pprint
+
+
+completion_base = os.environ.get("completion_base")
+openai_api_key = os.environ.get("openai_api_key")
+mongoDB = os.environ.get("MONGO_DB")
+template = """### Given the following context
+### Context
+{context}
+### Use it to explain the question: {question}
+"""
+
+
+async def fetch_data(question, context):
+    url = completion_base
+
+    payload = json.dumps(
+        {
+            "messages": [
+                {
+                    "role": "system",
+                    "content": "### You provide explanations based on the provided context",
+                },
+                {
+                    "role": "user",
+                    "content": template.format(context=context, question=question),
+                },
+            ],
+            "model": "gpt-3.5-turbo",
+            "temperature": 1,
+            "presence_penalty": 0,
+            "top_p": 0.95,
+            "frequency_penalty": 0,
+            "stream": False,
+        }
+    )
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {openai_api_key}",
+    }
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(url, headers=headers, data=payload) as response:
+            response = await response.json()
+            return response["choices"][0]["message"]["content"]
+
+
+async def delete_documents(task_id):
+    client = AsyncIOMotorClient(mongoDB)
+    db = client["transcriptions"]
+    collection = db["videos"]
+
+    result = await collection.delete_many({"task_id": task_id})
+    print(f"Deleted {result.deleted_count} document(s)")
+
+
+# mongo_client = MongoClient(
+#     mongoDB
+# )
+# model_name = "BAAI/bge-base-en"
+# collection = mongo_client["transcriptions"]["videos"]
+# embeddings = HuggingFaceEmbeddings(model_name=model_name)
+# index_name = "test_embeddings"
+# vectorstore = MongoDBAtlasVectorSearch(collection, embeddings, index_name=index_name)


 def generateChunks(chunks, task_id, n=100):
@@ -53,6 +102,7 @@ def search(query: str, task_id: str):
         embedding=embeddings,
         index_name="test_embedding",
     )
+
     data = vectorstore.similarity_search(
         query=query,
         pre_filter={"text": {"path": "task_id", "query": task_id}},
@@ -65,10 +115,7 @@ def search(query: str, task_id: str):
     # data =[d.dict() for d in data]
     # print(data[0].metadata.exclude({'_id','embedding'}))
     # pprint.pprint(data[0].metadata)
-    return [
-        {"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
-        for d in data
-    ]
+    return [{"text": d.page_content,'start':d.metadata['start'],"end":d.metadata['end']} for d in data]
     # agent =vectorstore.as_retriever(

     # )
@@ -87,4 +134,4 @@ def encode(temp: list[Document]):
     vectorstore.from_documents(
         temp, embedding=embeddings, collection=collection, index_name=index_name
     )
-    # return embeddings.embed_documents(texts = [d.page_content for d in temp])
+    # return embeddings.embed_documents(texts = [d.page_content for d in temp])
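Note on the new module above: fetch_data calls json.dumps and aiohttp.ClientSession, but this hunk only adds "import os,pprint", so json and aiohttp would still need to be imported before that helper can run. For the delete_documents helper that gives this commit its name, the following is a minimal standalone sketch of the same Motor pattern, assuming the MONGO_DB environment variable and the transcriptions/videos collection named in the diff; the typed signature, the __main__ runner, and the example task id are illustrative, not the repository's code.

# Standalone sketch of the delete-by-task_id pattern added in this commit.
# MONGO_DB and the transcriptions/videos collection come from the diff;
# everything else here is illustrative.
import asyncio
import os

from motor.motor_asyncio import AsyncIOMotorClient


async def delete_documents(task_id: str) -> int:
    client = AsyncIOMotorClient(os.environ.get("MONGO_DB"))
    collection = client["transcriptions"]["videos"]
    # Remove every embedded chunk that belongs to one transcription task.
    result = await collection.delete_many({"task_id": task_id})
    return result.deleted_count


if __name__ == "__main__":
    deleted = asyncio.run(delete_documents("example-task-id"))
    print(f"Deleted {deleted} document(s)")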
App/Transcription/TranscriptionRoutes.py CHANGED
@@ -20,7 +20,7 @@ from .Model import Transcriptions
 from .Utils.fastapi_tasks import perform_background_task
 import yt_dlp
 from fastapi_jwt_auth import AuthJWT
-from App.Embedding.utils.Initialize import delete_documents
+# from App.Embedding.utils.Initialize import delete_documents

 # from .Model import User
 # from sqlalchemy import and_
@@ -28,11 +28,13 @@ from App.Embedding.utils.Initialize import delete_documents

 transcription_router = APIRouter(tags=["Transcription"])

+
 def genUUID():
     uuid_value = uuid.uuid4()
     short_uuid = str(uuid_value)[:6]
     return short_uuid

+
 @transcription_router.get("/download-audio")
 async def download_audio(
     url: str,
@@ -43,7 +45,6 @@ async def download_audio(
     ),
     user: UserSchema = Depends(get_token_owner),
 ):
-
     youtube_url = url
     parsed_url = urlparse(youtube_url)

@@ -78,7 +79,7 @@ async def download_audio(
     }

     task = downloadfile.delay(url=url, ydl_opts=ydl_opts, model_size=model)
-    response = {"task_id": task.id, "file_name": video_title }
+    response = {"task_id": task.id, "file_name": video_title}
     transcription_enrty = await Transcriptions.objects.create(
         user=user, youtubeLink=url, **response
     )
@@ -111,7 +112,7 @@ async def delete_transcription(
         await transcript.delete()
         task = AsyncResult(task_id)
         task.revoke(terminate=True)
-        await delete_documents(task_id=task_id)
+        # await delete_documents(task_id=task_id)
         return {"code": 200, "message": f"deleted {task_id}", "payload": None}
     else:
         return {
@@ -132,8 +133,8 @@ async def create_file(
     ),
     user: UserSchema = Depends(get_token_owner),
 ):
-    extension = file.filename.split('.')[-1]
-    file_name = f'{genUUID()}.{extension}'
+    extension = file.filename.split(".")[-1]
+    file_name = f"{genUUID()}.{extension}"
     # Write the file to disk asynchronously
     Upload_dir = ""
     try:
@@ -155,7 +156,9 @@ async def create_file(
     transcription_enrty = await Transcriptions.objects.create(
         task_id=task.id, user=user, file_name=file_name
     )
-    background_tasks.add_task(perform_background_task,file_name ,file=file, task_id=task.id)
+    background_tasks.add_task(
+        perform_background_task, file_name, file=file, task_id=task.id
+    )
     return {
         "file_size": file.size,
         "file_name": file.filename,