Arafath10 committed
Commit 81a3473 · verified · 1 Parent(s): 02fdaae

Update main.py

Files changed (1)
  1. main.py +42 -122

main.py CHANGED
@@ -1,13 +1,20 @@
- from fastapi import FastAPI, HTTPException
+ from fastapi import FastAPI, HTTPException, Request
  from fastapi.responses import JSONResponse
  from fastapi.middleware.cors import CORSMiddleware
- import requests
- from AWSClaude import AWSClaude
- import json
- import concurrent.futures
- import time
+
+
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import numpy as np
+
+ # Load the tokenizer
+ tokenizer = AutoTokenizer.from_pretrained("Arafath10/reference_page_finder")
+
+ # Load the model
+ model = AutoModelForSequenceClassification.from_pretrained("Arafath10/reference_page_finder")
+

  app = FastAPI()
+
  app.add_middleware(
      CORSMiddleware,
      allow_origins=["*"],
@@ -16,127 +23,40 @@ app.add_middleware(
      allow_headers=["*"],
  )

-
-
- @app.post("/get_n_depth_results")
- async def get_n_depth_results(url,input_query):
-     # Start time
-     start_time = time.time()
-     all_content = {}
-
-     def add_pdf_content(selected_pdf):
-         for pdf_url in selected_pdf:
-             print(pdf_url)
-             response = requests.get(pdf_url)
-
-             # Save the content of the response as a PDF file
-             pdf_path = "temp.pdf"
-             with open(pdf_path, "wb") as file:
-                 file.write(response.content)
-
-             print(f"PDF file saved as {pdf_path}")
-
-             url = "http://localhost:5000/ask"
-             # url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v2"
-
-             data = {"processTables": "True"}
-
-             headers = {"Origin": "http://localhost:8080"}
-
-             with open(pdf_path, "rb") as file:
-                 file_contents = file.read()
-
-             files = {
-                 "pdf": (
-                     pdf_path,
-                     file_contents,
-                     "application/pdf",
-                 )
-             }
-
-             response = requests.post(url, files=files, data=data, headers=headers)
-             all_content[pdf_url] = response.json()
-
-     def scrapper(input_url):
-         params = {'url': input_url}
-         headers = {'accept': 'application/json'}
-         url = 'https://chromium-qpxamiokfa-uc.a.run.app/get_scraped_data'
-         try:
-             response = requests.get(url, headers=headers, params=params)
-             all_url = response.json()["URL"]
-             all_content[input_url] = response.json()["Content"]
-             return all_url
-         except:
-             print(f"found a error url : {input_url}=========================================")
-             return "none"
-
-     pdf_urls = []
-
-     def separate_pdf_and_nonPDF_links(urls):
-         # Separate URLs into two lists
-         pdf_links = [url for url in urls if url and url.endswith('.pdf')]
-         if pdf_links:
-             pdf_urls.append(pdf_links)
-         return [url for url in urls if not (url and url.endswith('.pdf'))]  # other links for rescraping
-
-     def call_llm_service(scraped_data, input_url, input_query, pdf):
-         query = f"""
-         Here are my scraped links:
-
-         {scraped_data}
-
-         correct hostname: {input_url} use this host name for all other tasks
-
-         I need the always full (www.hotname.com/../) {pdf} URLs for the most relevant links related to "{input_query}". use the correct hostname from this provided content, give raw hyperlink with json format only don't give extra text details. only give json output
-         example json format is only links don't include keys (i need the always full (www.hotname.com/../))
-         """
-         llm = "ClaudeHaiku"
-         env = ""
-         user_id = "KAusXF7jp0Q40urdZWtDLXEhrmA"
-         thread_id = "hKxvoVgi7vRJCHhvMzH5"
-         stream_id = "stream1"
-         app_type = "sentinel"
-         other_request_params = {"messages": [
-             {"role": "user", "content": query},
-         ]}
-         return AWSClaude(llm, env, user_id, thread_id, stream_id, app_type, other_request_params).invoke()
-
-     input_url = f'["{url}"]'
-     input_query = input_query
-
-     for step in range(1, 3):
-         print(f"=================={step} step of scraping to get selected URLs from LLM=================================")
-         next_urls = []
-         with concurrent.futures.ThreadPoolExecutor() as executor:
-             futures = [executor.submit(scrapper, input_url) for input_url in (json.loads(input_url)[:2])]
-             for future in concurrent.futures.as_completed(futures):
-                 next_urls.append(separate_pdf_and_nonPDF_links(future.result()))
-         if step==2:
-             break
-         selected_links_from_llm = call_llm_service(next_urls, input_url, input_query, "")
-         input_url = selected_links_from_llm
-         print(json.loads(input_url)[:2])
-
-     # End time
-     end_time = time.time()
-
-     # Calculate the time taken
-     time_taken = end_time - start_time
-
-     print(f"Time taken: {time_taken} seconds")
-     if not pdf_urls:
-         print(pdf_urls)
-         #return all_content.keys()
-         return all_content
-     else:
-         #selected_pdf = json.loads(call_llm_service(pdf_urls, input_url, input_query, "only end with .pdf extension"))
-         print(pdf_urls)
-         #print("selected pdf")
-         #print(selected_pdf)
-         #return all_content.keys()
-         return all_content
-
-
+ @app.post("/find_refrence_page")
+ async def find_refrence_page(request: Request):
+     try:
+         # Extract the JSON body
+         body = await request.json()
+         test_text = body.get("text")
+
+         if not test_text:
+             raise HTTPException(status_code=400, detail="Missing 'text' field in request body")
+
+         import re
+
+         # Remove all types of extra whitespace (spaces, tabs, newlines)
+         test_text = re.sub(r'\s+', ' ', test_text).strip()
+
+         def chunk_string(input_string, chunk_size):
+             return [input_string[i:i + chunk_size] for i in range(0, len(input_string), chunk_size)]
+
+         chunks = chunk_string(test_text, chunk_size=512)
+         chunks = reversed(chunks)
+         # Output the chunks
+         flag = "no reference found"
+         for idx, chunk in enumerate(chunks):
+             print(f"Chunk {idx + 1} {chunk}")
+             inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding="max_length")
+             outputs = model(**inputs)
+             predictions = np.argmax(outputs.logits.detach().numpy(), axis=-1)
+             #print("Prediction:", "yes reference found" if predictions[0] == 1 else "no reference found")
+             if predictions[0] == 1:
+                 flag = "yes reference found"
+                 break
+         return flag
+     except:
+         return "error"
+

  #print(main("https://www.keells.com/", "Please analyse reports"))
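
A minimal client sketch for the new route added in this commit, assuming the app is served locally (for example with uvicorn main:app --port 8000); the host, port, timeout, and sample text below are illustrative and not part of the commit:

    import requests

    # POST the document text to /find_refrence_page (route name as committed, including the spelling).
    # The endpoint reads the "text" field from the JSON body and returns a plain string:
    # "yes reference found", "no reference found", or "error".
    sample_text = "Body of a paper ... References 1. Doe, J. (2021). An example paper."
    resp = requests.post(
        "http://localhost:8000/find_refrence_page",  # assumed local address
        json={"text": sample_text},
        timeout=60,
    )
    print(resp.status_code, resp.json())

The handler normalizes whitespace, splits the text into 512-character chunks, and classifies the chunks in reverse order with the Arafath10/reference_page_finder sequence-classification model, stopping at the first positive prediction; scanning from the end makes sense because reference sections typically sit at the back of a document.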