dsmultimedika committed on
Commit 69beac6 · 1 Parent(s): b39c0ba

fix : change parsing

api/function.py CHANGED
@@ -29,12 +29,12 @@ async def data_ingestion(reference, file: UploadFile) -> Any:
         user_id="admin_book_uploaded",
     )
 
-    # Upload to AWS
-    file_name = f"{reference['title']}"
-    aws_loader = Loader()
+    # # Upload to AWS
+    # file_name = f"{reference['title']}"
+    # aws_loader = Loader()
 
-    file_obj = file
-    aws_loader.upload_to_s3(file_obj, file_name)
+    # file_obj = file
+    # aws_loader.upload_to_s3(file_obj, file_name)
 
     uploader = Uploader(reference, file)
 
api/router/book.py CHANGED
@@ -64,10 +64,6 @@ async def upload_file(
     if auth_response:
         return auth_response
 
-    # Create a new Metadata object
-    book_query = BookQuery(user)
-    book_query.add_book(db, title, author, category_id, year, publisher)
-
     logging.info("Database Inserted")
 
     # Query the category based on category_id
@@ -85,6 +81,10 @@ async def upload_file(
 
     # Process the file and handle data ingestion
     response = await data_ingestion(reference, file)
+
+    # Create a new Metadata object
+    # book_query = BookQuery(user)
+    # book_query.add_book(db, title, author, category_id, year, publisher)
 
     return {
         "filename": file.filename,
api/router/category.py CHANGED
@@ -4,6 +4,7 @@ from fastapi import APIRouter, Depends
 
 from db.models import Category
 from db.database import get_db
+from db.query.query_category import CategoryQuery
 from service.dto import CategoryCreate
 
 from script.vector_db import IndexManager
@@ -19,13 +20,19 @@ db_dependency = Annotated[Session, Depends(get_db)]
 
 
 @router.get("/category")
-async def get_all_categories(user: user_dependency, db: db_dependency):
+async def get_all_categories_router(user: user_dependency, db: db_dependency):
     if user is None or user.get("role_id") != 1:
         return JSONResponse(status_code=401, content="Authentication Failed")
 
     try:
         # Logic to retrieve all categories
-        categories = db.query(Category).all()
+        category_query = CategoryQuery(user)
+        categories = category_query.get_all_categories(db)
+
+        print(categories)
+
+        # categories = db.query(Category).all()
         if not categories:
             return JSONResponse(status_code=404, content="No categories found")
 
@@ -43,7 +50,7 @@ async def get_all_categories_router(user: user_dependency, db: db_dependency):
 
 
 @router.get("/category/{category_id}")
-async def get_categories_by_ids(
+async def get_category_by_id_router(
     user: user_dependency,
     db: db_dependency,
     category_id: int,
@@ -53,7 +60,8 @@ async def get_category_by_id_router(
 
     try:
         # Fetch categories based on the list of provided category_ids
-        category = db.query(Category).filter(Category.id == category_id).first()
+        category_query = CategoryQuery(user)
+        category = category_query.get_category_by_id(db, category_id)
 
         if category is None:
             return JSONResponse(status_code=404, content="No categories found for the given IDs")
@@ -75,21 +83,17 @@ async def create_category(user: user_dependency, db: db_dependency, category: CategoryCreate):
 
     try:
         # Check if category already exists
-        existing_category = (
-            db.query(Category).filter(Category.category == category.category_name).first()
-        )
-        if existing_category:
+        category_query = CategoryQuery(user)
+        existing_category = category_query.get_existing_category(db, category.category_name)
+        if not isinstance(existing_category, JSONResponse):
             return JSONResponse(status_code=400, content="Category already exists")
-
-        # Logic to create a new category
-        new_category = Category(category=category)  # Assuming Category is your model
-        db.add(new_category)
-        db.commit()
-        db.refresh(new_category)
+
+        # Add category
+        category_query.add_category(db, category.category_name)
+        print("category added")
 
         return {
             "message": "Category created successfully",
-            "category_id": new_category.id,
         }
 
     except IntegrityError:
@@ -99,13 +103,6 @@ async def create_category(user: user_dependency, db: db_dependency, category: CategoryCreate):
             content="Database integrity error: possibly a duplicate entry.",
         )
 
-    except SQLAlchemyError as e:
-        db.rollback()
-        return JSONResponse(
-            status_code=500, content="Database error occurred: " + str(e)
-        )
-
-
 @router.put("/category/{category_id}")
 async def update_category(
     user: user_dependency, db: db_dependency, category_id: int, category: CategoryCreate
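
One note on the new duplicate check: the route treats a JSONResponse coming back from `get_existing_category` as "no match" (presumably a not-found response produced inside BaseQuery), so a plain result means the name is already taken. A minimal sketch of that contract; the category name is illustrative:

    # Sketch: get_existing_category returns rows when a match exists,
    # or a not-found JSONResponse when it does not.
    existing_category = category_query.get_existing_category(db, "Cardiology")
    if not isinstance(existing_category, JSONResponse):
        # A real row came back, so the name is taken: reject with 400.
        return JSONResponse(status_code=400, content="Category already exists")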
api/router/testing.py ADDED
@@ -0,0 +1,82 @@
+from fastapi import FastAPI, HTTPException, Depends, Form
+from fastapi.security import OAuth2PasswordBearer
+import httpx
+import os
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+app = FastAPI()
+
+# Bearer token for API authentication
+BEARER_TOKEN = os.getenv("MEDUCINE_API_BEARER_TOKEN")
+
+# Base URL for the Meducine API
+BASE_URL = os.getenv("BASE_URL")
+
+# OAuth2PasswordBearer provides the token as a dependency
+oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/login")
+
+@app.post("/login")
+async def login(email: str = Form(...), password: str = Form(...)):
+    async with httpx.AsyncClient() as client:
+        try:
+            response = await client.post(
+                f"{BASE_URL}/actions/meducine-restapi/auth/login",
+                data={"email": email, "password": password},
+                headers={"Authorization": f"Bearer {BEARER_TOKEN}"},
+            )
+            response.raise_for_status()  # Raise an error for bad responses (4xx or 5xx)
+            return handle_response(response)  # Assuming this function formats the response correctly
+        except httpx.HTTPStatusError as e:
+            raise HTTPException(status_code=e.response.status_code, detail=e.response.text)
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/actions/meducine-restapi/auth/logout")
+async def logout(email: str = Form(...), password: str = Form(...)):
+    async with httpx.AsyncClient() as client:
+        response = await client.post(
+            f"{BASE_URL}/actions/meducine-restapi/auth/logout",
+            data={"email": email, "password": password},
+            headers={"Authorization": f"Bearer {BEARER_TOKEN}"},
+        )
+        return handle_response(response)
+
+@app.get("/actions/meducine-restapi/auth/identity")
+async def get_identity(token: str = Depends(oauth2_scheme)):
+    async with httpx.AsyncClient() as client:
+        response = await client.get(
+            f"{BASE_URL}/actions/meducine-restapi/auth/identity",
+            headers={"Authorization": f"Bearer {token}"},
+        )
+        return handle_response(response)
+
+@app.get("/actions/meducine-restapi/user/has-premium-access")
+async def check_premium_access(feature: str, token: str = Depends(oauth2_scheme)):
+    async with httpx.AsyncClient() as client:
+        response = await client.get(
+            f"{BASE_URL}/actions/meducine-restapi/user/has-premium-access",
+            params={"feature": feature},
+            headers={"Authorization": f"Bearer {token}"},
+        )
+        return handle_response(response)
+
+def handle_response(response: httpx.Response):
+    """
+    Handle the response from the Meducine API, returning appropriate results based on status codes.
+    """
+    if response.status_code in range(200, 300):
+        return response.json()  # Successful request
+    elif response.status_code in range(400, 500):
+        raise HTTPException(status_code=response.status_code, detail=response.json())  # Client error
+    elif response.status_code in range(500, 600):
+        raise HTTPException(status_code=response.status_code, detail="Server error")  # Server error
+    else:
+        raise HTTPException(status_code=500, detail="Unexpected error")
+
+# Run the application
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="127.0.0.1", port=8000)
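
Assuming the file is run directly (uvicorn on 127.0.0.1:8000), a quick smoke test of the proxy could look like this; the credentials are placeholders, and BASE_URL plus the bearer token must be set in .env:

    import httpx

    # Placeholder credentials; the server forwards them to the Meducine API
    resp = httpx.post(
        "http://127.0.0.1:8000/login",
        data={"email": "user@example.com", "password": "secret"},
    )
    print(resp.status_code, resp.json())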
core/chat/bot_service.py CHANGED
@@ -10,7 +10,7 @@ from llama_index.core.llms import MessageRole
 
 from core.chat.engine import Engine
 from core.chat.chatstore import ChatStore
-from core.parser import clean_text, update_response, renumber_sources
+from core.parser import clean_text, update_response, sort_and_renumber_sources
 
 from service.dto import ChatMessage
 from pymongo.mongo_client import MongoClient
@@ -47,7 +47,7 @@ class ChatCompletionService:
 
         # Update response and renumber sources
         response = update_response(str(response))
-        contents = renumber_sources(contents)
+        contents = sort_and_renumber_sources(contents)
 
         # Add contents to metadata
         metadata_collection = self._attach_contents_to_metadata(contents, metadata_collection)
core/parser.py CHANGED
@@ -75,6 +75,29 @@ def renumber_sources(source_list):
     return new_sources
 
 
+def sort_and_renumber_sources(source_list):
+    """
+    Sort a list of sources by their source number, then renumber them
+    sequentially starting from 1.
+
+    :param source_list: List of strings containing source information.
+    :return: Sorted and renumbered list of sources.
+    """
+
+    # Extract the numeric part of a "Source N" label
+    def extract_source_number(source):
+        match = re.search(r"Source (\d+)", source)
+        return int(match.group(1)) if match else float("inf")
+
+    # Sort sources based on the source number
+    sorted_sources = sorted(source_list, key=extract_source_number)
+
+    # Reassign the numbering in the sorted sources
+    for idx, source in enumerate(sorted_sources, 1):
+        sorted_sources[idx - 1] = re.sub(r"Source \d+", f"Source {idx}", source)
+
+    return sorted_sources
+
 def seperate_to_list(text):
     # Step 1: Split the text by line breaks (\n)
     lines = text.split("\n")
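
A quick sketch of what the new helper does with an out-of-order source list; the source strings are made up:

    from core.parser import sort_and_renumber_sources

    contents = [
        "Source 3: excerpt from chapter one",
        "Source 1: excerpt from chapter four",
        "Source 2: excerpt from chapter two",
    ]
    print(sort_and_renumber_sources(contents))
    # ['Source 1: excerpt from chapter four',
    #  'Source 2: excerpt from chapter two',
    #  'Source 3: excerpt from chapter one']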
db/query/base_query.py CHANGED
@@ -81,21 +81,30 @@ class BaseQuery:
 
     def update(self, db, model, id, update_data, filter_conditions=None):
         """Update an entry by ID."""
+        # Define the initial query to fetch the entry
         query = select(model).where(model.id == id)
+
+        # Append additional filter conditions if provided
         if filter_conditions:
-            query = query.where(*filter_conditions)
+            query = query.where(model.id == id, *filter_conditions)
 
+        # Attempt to fetch the entry
         not_found_message = f"Entry with ID {id} not found."
         entry = self._fetch(db, query, not_found_message, multiple=False)
 
+        # Check if the entry was found
        if isinstance(entry, JSONResponse):
            return entry
 
-        db.execute(
-            update(model)
-            .where(model.id == id, model.user_id == self.user_id)
-            .values(update_data)
-        )
+        # Prepare the update statement
+        stmt = update(model).where(model.id == id).values(update_data)
+        db.execute(stmt)
+
+        # If filter conditions were provided, apply them to the update as well
+        if filter_conditions:
+            filter_stmt = update(model).where(model.id == id, *filter_conditions).values(update_data)
+            db.execute(filter_stmt)
+
         return self._handle_commit(db)
 
     def update_entries(self, db, model, update_data, filter_conditions=None):
@@ -128,8 +137,6 @@ class BaseQuery:
         else:
             query = select(model)
 
-        # Apply filtering by user ID and optional conditions
-        query = query
         if id:
             query = query.where(model.id == id)
         if filter_conditions:
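
Worth noting: with filter conditions present, the new `update` issues two UPDATE statements, and the first one is not restricted by the filters. If a single filtered statement is the intent, a sketch along these lines would do it; same SQLAlchemy `update` as above, the helper name is made up:

    from sqlalchemy import update

    def build_update_stmt(model, id, update_data, filter_conditions=None):
        # Collect the ID match plus any optional filters into one WHERE clause
        conditions = [model.id == id]
        if filter_conditions:
            conditions.extend(filter_conditions)
        return update(model).where(*conditions).values(update_data)

    # db.execute(build_update_stmt(Category, category_id, {"category": name}))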
db/query/query_book.py CHANGED
@@ -81,7 +81,12 @@ class BookQuery(BaseQuery):
 
     def update_book(self, db, book_id, title, author):
         update_data = {"title": title, "author": author}
-        self.update(db, Metadata, book_id, update_data)
+        self.update(
+            db,
+            model=Metadata,
+            id=book_id,
+            update_data=update_data,
+        )
 
     def delete_book(self, db, book_id):
         self.delete(db, Metadata, book_id)
@@ -92,15 +97,6 @@ class BookQuery(BaseQuery):
     def get_metadata_books(self, db, metadata_id):
         return self.get(db, Metadata, id=metadata_id)
 
-    # def get_title_from_session(self, db, metadata_id, session_id):
-    #     model = Session_Publisher
-    #     columns = [Metadata.title]
-    #     join_models = [Session_Publisher.id == session_id, Metadata.id == metadata_id]
-
-    #     titles = self.get_all_with_join_columns(db, model, columns, join_models)
-
-    #     return titles
-
     def get_title_from_session(self, db, metadata_id, session_id):
         model = Session_Publisher
         columns = [Metadata.title]
db/query/query_bot.py CHANGED
@@ -39,7 +39,13 @@ class BotQuery(BaseQuery):
             Session_Publisher.metadata_id == metadata_id,
         ]
 
-        sessions = self.get_columns(db, model=model, columns=columns, filter_conditions=filter_conditions, multiple=True)
+        sessions = self.get_columns(
+            db,
+            model=model,
+            columns=columns,
+            filter_conditions=filter_conditions,
+            multiple=True,
+        )
 
         return sessions
 
db/query/query_category.py CHANGED
@@ -1,17 +1,16 @@
 from db.models import Category, Metadata
 from db.query.base_query import BaseQuery
-from sqlalchemy import select, delete, update
 
 class CategoryQuery(BaseQuery):
     def __init__(self, user):
         super().__init__(user)
 
-    def add_category(self, db, name):
-        new_category = Category(name=name, user_id=self.user["id"])
+    def add_category(self, db, category_name):
+        new_category = Category(category=category_name)
         return self.add(db, new_category)
 
-    def update_category(self, db, category_id, name):
-        update_data = {"name": name}
+    def update_category(self, db, category_id, category_name):
+        update_data = {"category": category_name}
         self.update(db, Category, category_id, update_data)
 
     def delete_category(self, db, category_id):
@@ -20,7 +19,22 @@ class CategoryQuery(BaseQuery):
 
     def get_category(self, db, category_id):
         columns = [Category.category]
         model = Category
-        return self.get_columns(db, model=model, columns=columns, id=category_id)
+        results = self.get_columns(db, model=model, columns=columns, id=category_id)
+        return results
+
+    def get_existing_category(self, db, category_name):
+        columns = [Category.category]
+        model = Category
+        filter_condition = [Category.category == category_name]
+        results = self.get_columns(db, columns=columns, model=model, filter_conditions=filter_condition)
+        return results
+
+    def get_category_by_id(self, db, category_id):
+        model = Category
+        filter_conditions = [Category.id == category_id]
+        results = self.get(db, model=model, filter_conditions=filter_conditions)
+
+        return results
 
     def get_current_category(self, db, metadata_id):
         columns = [Category.category]
@@ -38,4 +52,6 @@ class CategoryQuery(BaseQuery):
         return result
 
     def get_all_categories(self, db):
-        return self.get(db, Category, multiple=True)
+        results = self.get(db, Category, multiple=True)
+        categories = [category[0] for category in results]
+        return categories
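
The list comprehension in `get_all_categories` unwraps result rows. Assuming `BaseQuery.get` executes a `select(Category)` and returns the `.all()` rows, each row is a one-element Row, so the ORM object sits at index 0:

    from sqlalchemy import select

    # db is the route's Session; rows come back as 1-tuples (Row objects)
    rows = db.execute(select(Category)).all()   # e.g. [(<Category 1>,), (<Category 2>,)]
    categories = [row[0] for row in rows]       # [<Category 1>, <Category 2>]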
db/query/query_user_meta.py CHANGED
@@ -6,24 +6,6 @@ from db.query.base_query import BaseQuery
 
 class UserMetaQuery(BaseQuery):
     def __init__(self, user):
         super().__init__(user)
-
-    # def get_user_meta_entries(self, db):
-    #     """Fetch all user meta entries joined with metadata and category."""
-    #     join_models = [Metadata, Category]
-    #     print(join_models)
-    #     join_conditions = [
-    #         User_Meta.metadata_id == Metadata.id,
-    #         Metadata.category_id == Category.id,
-    #     ]
-    #     print(join_conditions)
-
-    #     result = self.get_all_with_joins(
-    #         db,
-    #         model=User_Meta,
-    #         join_models=join_models,
-    #         join_conditions=join_conditions,
-    #     )
-    #     return result
 
     def get_user_meta_entries(self, db):
         """Fetch all user meta entries joined with metadata and category."""
db/repository.py DELETED
@@ -1,36 +0,0 @@
-from databases import Database
-import datetime
-
-
-def get_db_conn(config):
-    db_url = f"{config.DB_URI}"
-    return Database(db_url)
-
-
-class Repository:
-    def __init__(self, db_conn):
-        self.db_conn = db_conn
-
-    async def get_by_query(self, query, param):
-        results = await self.db_conn.fetch_all(query, param)
-        print("result get _by query", results)
-        return [dict(result) for result in results]
-
-    async def _fetch_one(self, query, param):
-        result = await self.db_conn.fetch_one(query, param)
-        return dict(result) if result is not None else result
-
-    async def _exec(self, query, param):
-        return await self.db_conn.execute(query, param)
-
-    async def _exec_many(self, query, params):
-        return await self.db_conn.execute_many(query, params)
-
-    def update_params(self, params, update=False):
-        current_time = datetime.datetime.now()
-        if update == False:
-            params.update({"createdAt": current_time, "updatedAt": current_time})
-        else:
-            params.update({"updatedAt": current_time})
-        return params
script/document_uploader.py CHANGED
@@ -2,15 +2,16 @@ from llama_index.core.ingestion import IngestionPipeline
 from llama_index.embeddings.openai import OpenAIEmbedding
 from config import PINECONE_CONFIG
 from pinecone.grpc import PineconeGRPC as Pinecone
-from service.reader import Reader
+# from service.reader import Reader
 from script.get_metadata import Metadata
-from fastapi import UploadFile,status
+from fastapi import UploadFile, status
 from fastapi.responses import JSONResponse
 
 from llama_index.core.node_parser import (
     SentenceSplitter,
     SemanticSplitterNodeParser,
 )
+from service.reader_v3 import upload_file
 
 # from script.get_topic import extract_topic
 
@@ -23,18 +24,18 @@ class Uploader:
     def __init__(self, reference, file: UploadFile):
         self.file = file
         # self.content_table = content_table
-        self.reader = Reader()
+        # self.reader = Reader()
         self.reference = reference
         self.metadata = Metadata(reference)
 
-    async def ingest_documents(self, file: UploadFile):
-        """Load documents from the storage path."""
-        documents = await self.reader.read_from_uploadfile(file)
-        print("Number of documents:", len(documents))
-        print("Documents successfully ingested")
-
-        return documents
-
+    # async def ingest_documents(self, file: UploadFile):
+    #     """Load documents from the storage path."""
+    #     documents = await self.reader.read_from_uploadfile(file)
+    #     print("Number of documents:", len(documents))
+    #     print("Documents successfully ingested")
+
+    #     return documents
 
     def check_existing_metadata(self, pinecone_index, title, random_vector):
         try:
             result = pinecone_index.query(
@@ -53,17 +54,17 @@ class Uploader:
 
     async def process_documents(self):
         # Ingest documents
-        documents = await self.ingest_documents(self.file)
-
-        # topic_extractor = extract_topic(self.reference, self.content_table)
-
-        embed_model = OpenAIEmbedding()
+        # documents = await self.ingest_documents(self.file)
 
         # Get metadata
-        documents_with_metadata = self.metadata.apply_metadata(documents)
-
+        # documents_with_metadata = self.metadata.apply_metadata(documents)
+        documents_with_metadata = await upload_file(self.reference, self.file)
+
+        # Get Topic
+        # topic_extractor = extract_topic(self.reference, self.content_table)
         # document_filtered = self.filter_document(documents_with_metadata)
 
+        embed_model = OpenAIEmbedding()
         # Set up the ingestion pipeline
         pipeline = IngestionPipeline(
             transformations=[
@@ -75,7 +76,7 @@ class Uploader:
                 # topic_extractor,
             ]
         )
-
+
         # splitter = SemanticSplitterNodeParser(
         #     buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
        # )
@@ -85,12 +86,14 @@ class Uploader:
             nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
             # nodes_with_metadata = splitter.get_nodes_from_documents(documents_with_metadata)
             return nodes_with_metadata
-
+
        except Exception as e:
            try:
                # If the first method fails, fallback to sentence splitter
                sentence_splitter = SentenceSplitter(chunk_size=512)
-                nodes_with_metadata = sentence_splitter.get_nodes_from_documents(documents_with_metadata)
+                nodes_with_metadata = sentence_splitter.get_nodes_from_documents(
+                    documents_with_metadata
+                )
                print("Pipeline processing completed with SentenceSplitter fallback.")
                return nodes_with_metadata
            except Exception as fallback_error:
@@ -100,7 +103,7 @@ class Uploader:
                 status_code=500,
                 content="An internal server error occurred during pipeline processing.",
             )
-
+
     def filter_document(self, documents):
         api_key = PINECONE_CONFIG.PINECONE_API_KEY
         client = Pinecone(api_key=api_key)
@@ -117,4 +120,4 @@ class Uploader:
             if len(result) == 0:
                 filtered_documents.append(doc)
 
-        return filtered_documents
+        return filtered_documents
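
For reference, the split-with-fallback pattern in `process_documents` can be sketched standalone like this. The transformation list mirrors the commented-out SemanticSplitterNodeParser variant; the actual pipeline contents are partly elided in the diff, and OPENAI_API_KEY is assumed to be set:

    from llama_index.core import Document
    from llama_index.core.ingestion import IngestionPipeline
    from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser
    from llama_index.embeddings.openai import OpenAIEmbedding

    docs = [Document(text="Some parsed journal text...")]
    embed_model = OpenAIEmbedding()
    pipeline = IngestionPipeline(
        transformations=[
            SemanticSplitterNodeParser(
                buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
            ),
            embed_model,
        ]
    )

    try:
        nodes = pipeline.run(documents=docs)
    except Exception:
        # Fall back to a fixed-size sentence splitter if the semantic pass fails
        nodes = SentenceSplitter(chunk_size=512).get_nodes_from_documents(docs)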
service/reader_v3.py ADDED
@@ -0,0 +1,86 @@
+import os
+import nest_asyncio
+
+from llama_parse import LlamaParse
+from llama_index.core.node_parser import SimpleNodeParser
+from dotenv import load_dotenv
+from fastapi import UploadFile, File
+from fastapi.responses import JSONResponse
+import fitz
+
+from script.get_metadata import Metadata
+
+load_dotenv()
+nest_asyncio.apply()
+
+
+def parse_journal(content: bytes, file_name: str):
+    """Parse the journal using LlamaParse."""
+    try:
+        # Initialize the parser
+        parser = LlamaParse(
+            api_key=os.getenv("LLAMA_PARSE_API_KEY"),
+            result_type="markdown",
+            use_vendor_multimodal_model=True,
+            vendor_multimodal_model_name="openai-gpt-4o-mini",
+        )
+
+        # Load and process the document
+        llama_parse_documents = parser.load_data(
+            content, extra_info={"file_name": file_name}
+        )
+
+        return llama_parse_documents
+
+    except Exception as e:
+        return JSONResponse(status_code=400, content=f"Error processing file: {e}")
+
+
+async def extract_metadata(content: bytes):
+    """Extract metadata from the PDF content."""
+    try:
+        # Open the binary content with PyMuPDF
+        pdf_document = fitz.open("pdf", content)  # "pdf" specifies the format
+
+        # Extract metadata
+        metadata = pdf_document.metadata
+
+        # Prepare metadata dictionary with default values for missing fields
+        metadata_dict = {
+            "title": metadata.get("title", "N/A"),
+            "author": metadata.get("author", "N/A"),
+            "subject": metadata.get("subject", "N/A"),
+            "keywords": metadata.get("keywords", "N/A"),
+            "creation_date": metadata.get("created", "N/A"),
+            "modification_date": metadata.get("modified", "N/A"),
+        }
+
+        return metadata_dict
+
+    except Exception as e:
+        return JSONResponse(status_code=500, content=f"Error extracting metadata: {e}")
+
+
+async def upload_file(reference, file: UploadFile):
+    try:
+        # Read the binary content of the uploaded file once
+        content = await file.read()
+        # Parse the journal
+        parsed_documents = parse_journal(content, file.filename)
+        # Extract metadata
+        # metadata_dict = await extract_metadata(content)
+        # print("Metadata Dictionary : \n\n", metadata_dict)
+
+        metadata_gen = Metadata(reference)
+        documents_with_metadata = metadata_gen.apply_metadata(parsed_documents)
+
+        print("Documents with metadata:\n\n", documents_with_metadata)
+        print("Number of documents:\n", len(documents_with_metadata))
+
+        # Return both parsed documents and metadata
+        return documents_with_metadata
+
+    except Exception as e:
+        return JSONResponse(status_code=500, content=f"Error processing file: {e}")
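
Putting the commit together, the parsing path now flows router → `data_ingestion` → `Uploader.process_documents` → `reader_v3.upload_file`. Roughly, inside an async route:

    # Rough call chain after this commit (names from the files above)
    uploader = Uploader(reference, file)        # reference: dict with title, author, ...
    nodes = await uploader.process_documents()  # LlamaParse -> apply metadata -> pipeline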
utils/error_handlers.py CHANGED
@@ -9,12 +9,12 @@ def handle_exception(e: Exception):
 
 
 def handle_error(e, message):
-    return JSONResponse(status_code=500, content={"error": f"{message}: {str(e)}"})
+    return JSONResponse(status_code=500, content={"error occurs": f"{message}: {str(e)}"})
 
 
 def not_found_error(message):
-    return JSONResponse(status_code=404, content={"error": message})
+    return JSONResponse(status_code=404, content={"not found message": message})
 
 
 def no_entries_found(message):
-    return JSONResponse(status_code=404, content={"message": message})
+    return JSONResponse(status_code=404, content={"no entries found": message})
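
The renamed keys change the error payloads to shapes like these; the arguments are illustrative:

    handle_error(ValueError("bad id"), "Update failed")
    # 500 -> {"error occurs": "Update failed: bad id"}
    not_found_error("Book not found")
    # 404 -> {"not found message": "Book not found"}
    no_entries_found("No sessions for this user")
    # 404 -> {"no entries found": "No sessions for this user"}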