om4r932 committed on
Commit
7f4e86d
·
1 Parent(s): a5c3314

Add already indexed doc handling

Browse files
Files changed (1) hide show
  1. app.py +32 -8
app.py CHANGED
@@ -1,16 +1,14 @@
1
- import requests
2
- import os
3
- import zipfile
4
  from io import BytesIO
5
- import subprocess
6
- import os
7
- import re
8
- import warnings
9
  from fastapi import FastAPI, HTTPException
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from pydantic import BaseModel
12
 
13
- warnings.filterwarnings("ignore")
14
 
15
  app = FastAPI(title="3GPP Specification Splitter API",
16
  description="API to split and display specifications by their chapters & sub-chapters",
@@ -28,12 +26,36 @@ app.add_middleware(
28
  allow_headers=["*"],
29
  )
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  class SpecRequest(BaseModel):
32
  spec_id: str
33
 
34
  @app.post("/get_full_text")
35
  def get_text(request: SpecRequest):
36
  specification = request.spec_id
 
 
 
37
  total_file = []
38
  url = requests.post(
39
  "https://organizedprogrammers-3gppdocfinder.hf.space/find",
@@ -92,6 +114,8 @@ def get_text(request: SpecRequest):
92
 
93
  @app.post("/get_spec_content")
94
  def get_spec_content(request: SpecRequest):
 
 
95
  text = get_text(request)
96
  chapters = []
97
  chapter_regex = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+$")
 
1
+ import requests, os, zipfile, subprocess, re, warnings
2
+ warnings.filterwarnings("ignore")
 
3
  from io import BytesIO
4
+ from dotenv import load_dotenv
5
+ from datasets import load_dataset
6
+ from huggingface_hub import login
 
7
  from fastapi import FastAPI, HTTPException
8
  from fastapi.middleware.cors import CORSMiddleware
9
  from pydantic import BaseModel
10
 
11
+ load_dotenv()
12
 
13
  app = FastAPI(title="3GPP Specification Splitter API",
14
  description="API to split and display specifications by their chapters & sub-chapters",
 
26
  allow_headers=["*"],
27
  )
28
 
29
+ spec_contents = load_dataset("OrganizedProgrammers/3GPPSpecContent", token=os.environ["HF_TOKEN"])
30
+ spec_contents = spec_contents["train"].to_list()
31
+
32
def is_doc_indexed(spec_id: str) -> bool:
    """Return True if at least one indexed row belongs to ``spec_id``.

    Scans the module-level ``spec_contents`` list and compares each row's
    ``"doc_id"`` value with ``spec_id`` using exact equality.

    Args:
        spec_id: Identifier of the document to look for.

    Returns:
        True when a row with a matching ``"doc_id"`` exists, otherwise False.
    """
    # A generator lets any() stop at the first match instead of first
    # materialising a True/False list for the whole dataset.
    return any(spec_id == s["doc_id"] for s in spec_contents)
34
+
35
def get_full_doc(spec_id: str):
    """Build the full text of a document from its indexed rows.

    Every row of the module-level ``spec_contents`` whose ``"doc_id"`` equals
    ``spec_id`` is rendered as its ``"section"`` value, a newline, then its
    ``"content"`` value. Rendered rows are joined with a blank line between
    them, in the order they appear in ``spec_contents``.

    Args:
        spec_id: Identifier of the document to assemble.

    Returns:
        The concatenated text, or an empty string when no row matches.
    """
    matching_parts = (
        f"{entry['section']}\n{entry['content']}"
        for entry in spec_contents
        if entry["doc_id"] == spec_id
    )
    return "\n\n".join(matching_parts)
41
+
42
def get_structured_doc(spec_id: str):
    """Return a document's indexed rows as a section-to-content mapping.

    Walks the module-level ``spec_contents`` in order and keeps the rows whose
    ``"doc_id"`` equals ``spec_id``. If the same ``"section"`` value occurs
    more than once, the later row overwrites the earlier one.

    Args:
        spec_id: Identifier of the document to collect.

    Returns:
        A dict keyed by ``"section"`` with ``"content"`` as values; empty when
        no row matches.
    """
    return {
        entry["section"]: entry["content"]
        for entry in spec_contents
        if entry["doc_id"] == spec_id
    }
48
+
49
+
50
  class SpecRequest(BaseModel):
51
  spec_id: str
52
 
53
  @app.post("/get_full_text")
54
  def get_text(request: SpecRequest):
55
  specification = request.spec_id
56
+ if is_doc_indexed(specification):
57
+ return get_full_doc(specification)
58
+ print(f"[WARNING] Document no. {specification} not indexed or is a TDoc, if it's a specification, try to reindex")
59
  total_file = []
60
  url = requests.post(
61
  "https://organizedprogrammers-3gppdocfinder.hf.space/find",
 
114
 
115
  @app.post("/get_spec_content")
116
  def get_spec_content(request: SpecRequest):
117
+ if is_doc_indexed(request.spec_id):
118
+ return get_structured_doc(request.spec_id)
119
  text = get_text(request)
120
  chapters = []
121
  chapter_regex = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+$")