Commit · 59fbf6a
1 Parent(s): 8bd7fd1
changed split logic to resolve short generated text, more search websites, and some logging
Files changed:
- ai_generate.py +14 -3
- app.py +6 -9
- google_search.py +20 -3
ai_generate.py
CHANGED
@@ -17,7 +17,7 @@ from langchain_community.embeddings.sentence_transformer import (
 )
 from langchain.schema import StrOutputParser
 from langchain_community.vectorstores import Chroma
-from langchain_text_splitters import CharacterTextSplitter
+from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
 from langchain import hub
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough, RunnableMap
@@ -44,10 +44,17 @@ vertexai.init(project="proprietary-info-detection", location="us-central1")
 gemini_client = GenerativeModel("gemini-1.5-pro-001")
 claude_client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
 
+# LLM params
 # For GPT-4 1 word is about 1.3 tokens.
 temperature = 1.0
 max_tokens = 2048
 
+# RAG params
+CHUNK_SIZE = 1024
+CHUNK_OVERLAP = CHUNK_SIZE // 8
+K = 10
+FETCH_K = 20
+
 llm_model_translation = {
     "LLaMA 3": "llama3-70b-8192",
     "OpenAI GPT 4o Mini": "gpt-4o-mini",
@@ -82,7 +89,7 @@ def load_llm(model: str, api_key: str, temperature: float = 1.0, max_length: int
 
 def create_db_with_langchain(path: list[str], url_content: dict):
     all_docs = []
-    text_splitter =
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
     embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
     if path:
         for file in path:
@@ -99,6 +106,10 @@ def create_db_with_langchain(path: list[str], url_content: dict):
             docs = text_splitter.split_documents([doc])
             all_docs.extend(docs)
 
+    # print docs
+    for idx, doc in enumerate(all_docs):
+        print(f"Doc: {idx} | Length = {len(doc.page_content)}")
+
     assert len(all_docs) > 0, "No PDFs or scrapped data provided"
     db = Chroma.from_documents(all_docs, embedding_function)
     return db
@@ -120,7 +131,7 @@ def generate_rag(
         print("Failed to load LLM. Aborting operation.")
         return None
     db = create_db_with_langchain(path, url_content)
-    retriever = db.as_retriever(search_type="mmr", search_kwargs={"k":
+    retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K})
     rag_prompt = hub.pull("rlm/rag-prompt")
 
     def format_docs(docs):
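The new RAG constants feed two LangChain calls: RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP) for chunking, and db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K}) for maximal-marginal-relevance retrieval. Below is a minimal standalone sketch of that pipeline, assuming the same packages ai_generate.py already imports; the sample text and query are made up for illustration and are not part of the repo.

from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Same constants the commit introduces.
CHUNK_SIZE = 1024
CHUNK_OVERLAP = CHUNK_SIZE // 8  # 128 characters shared between neighbouring chunks
K = 10        # chunks handed to the RAG prompt
FETCH_K = 20  # candidates MMR considers before keeping the K most diverse

# Hypothetical scraped text standing in for url_content / PDF pages.
sample_text = "Retrieval-augmented generation grounds an LLM in source documents. " * 600

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
docs = text_splitter.create_documents([sample_text])
print(f"Split into {len(docs)} chunks, first lengths: {[len(d.page_content) for d in docs[:5]]}")

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma.from_documents(docs, embedding_function)

retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K})
retrieved = retriever.invoke("How does retrieval-augmented generation ground an LLM?")
print(f"Retrieved {len(retrieved)} chunks for the prompt context")

Setting fetch_k larger than k lets MMR trade a little raw similarity for diversity among the k chunks that actually reach the prompt.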
app.py
CHANGED
@@ -435,7 +435,7 @@ def generate_and_format(
 ):
     content_string = ""
     url_content = None
-    ai_model = "
+    ai_model = "OpenAI GPT 4o"
     if google_search_check:
         date_from = build_date(year_from, month_from, day_from)
         date_to = build_date(year_to, month_to, day_to)
@@ -637,7 +637,7 @@ def create_interface():
         google_default = False
         with gr.Row():
             google_search_check = gr.Checkbox(
-                label="Enable
+                label="Enable Internet Search For Recent Sources", value=google_default
             )
         with gr.Group(visible=google_default) as search_options:
             with gr.Row():
@@ -682,7 +682,7 @@ def create_interface():
         gr.Markdown("# Add Optional PDF Files with Information", elem_classes="text-center text-3xl mb-6")
         pdf_file_input = gr.File(label="Upload PDF(s)", file_count="multiple", file_types=[".pdf"])
 
-        # HIDE AI MODEL SELECTION
+        # NOTE: HIDE AI MODEL SELECTION
         # with gr.Group():
         #     gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
         #     ai_generator = gr.Dropdown(
@@ -710,18 +710,15 @@ def create_interface():
                     "Base Model",
                     "Large Model",
                     "XL Model",
-                    # "XL Law Model",
-                    # "XL Marketing Model",
-                    # "XL Child Style Model",
                 ],
-                value="
+                value="XL Model",
                 label="Humanizer Model Version",
             )
         with gr.Row():
             temperature_slider = gr.Slider(
-                minimum=0.5, maximum=2.0, step=0.1, value=1.
+                minimum=0.5, maximum=2.0, step=0.1, value=1.1, label="Temperature"
             )
-            top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=
+            top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=40, label="Top k")
         with gr.Row():
             repetition_penalty_slider = gr.Slider(
                 minimum=1.0, maximum=2.0, step=0.1, value=1, label="Repetition Penalty"
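For context on the changed lines, the surrounding UI follows a common Gradio pattern: a Checkbox whose default comes from google_default and a Group of follow-up controls that is only shown when the box is ticked. The sketch below is hypothetical and standalone, not app.py's actual layout (the sliders live in a different section of the real UI); it only illustrates the checkbox/Group wiring together with the new default values.

import gradio as gr

google_default = False  # same default as in app.py

with gr.Blocks() as demo:
    google_search_check = gr.Checkbox(
        label="Enable Internet Search For Recent Sources", value=google_default
    )
    # Hypothetical stand-in for the real search_options group.
    with gr.Group(visible=google_default) as search_options:
        gr.Markdown("Search filters would go here.")

    temperature_slider = gr.Slider(
        minimum=0.5, maximum=2.0, step=0.1, value=1.1, label="Temperature"
    )
    top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=40, label="Top k")

    # Show or hide the group whenever the checkbox is toggled.
    google_search_check.change(
        lambda checked: gr.update(visible=checked),
        inputs=google_search_check,
        outputs=search_options,
    )

if __name__ == "__main__":
    demo.launch()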
google_search.py
CHANGED
@@ -29,21 +29,38 @@ def clean_html(text):
 
 
 def build_results_beautifulsoup(url_list):
-
+    print("Starting to scrape URLs...")
     start_time = time.perf_counter()
+
+    # scrape URLs in list
     soups = asyncio.run(parallel_scrap(url_list))
-
+
+    scraping_time = time.perf_counter() - start_time
+    print(f"Scraping processing time: {scraping_time:.2f} seconds")
+
     result_content = {}
-    num_pages =
+    num_pages = 10
     count = 0
+
+    print("Starting to process each URL...")
     for url, soup in zip(url_list, soups):
         if count >= num_pages:
+            print(f"Reached the limit of {num_pages} pages. Stopping processing.")
             break
+
         if soup:
+            print(f"Processing URL: {url}")
             text = clean_html(soup.text)
             if len(text) > 500:
+                print(f"Adding content from URL: {url}, content length: {len(text)}")
                 result_content[url] = text
                 count += 1
+            else:
+                print(f"Skipped URL: {url}, content too short (length: {len(text)})")
+        else:
+            print(f"Skipped URL: {url}, no soup content available.")
+
+    print("Finished processing URLs.")
     return result_content
 
 
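The scraping itself is unchanged here; the commit only adds timing via time.perf_counter(), a hard cap of num_pages = 10 kept results, and print statements recording why each URL was kept or skipped. The standalone sketch below shows that keep/skip/limit pattern on made-up page text instead of real network calls; collect_long_texts and fake_pages are illustrative names, not part of the repo.

import time

def collect_long_texts(candidates: dict[str, str], num_pages: int = 10, min_length: int = 500) -> dict[str, str]:
    start_time = time.perf_counter()
    result_content = {}
    count = 0
    for url, text in candidates.items():
        if count >= num_pages:
            print(f"Reached the limit of {num_pages} pages. Stopping processing.")
            break
        if text and len(text) > min_length:
            print(f"Adding content from URL: {url}, content length: {len(text)}")
            result_content[url] = text
            count += 1
        else:
            print(f"Skipped URL: {url}, content too short or empty (length: {len(text)})")
    print(f"Processing time: {time.perf_counter() - start_time:.2f} seconds")
    return result_content

if __name__ == "__main__":
    fake_pages = {"https://example.com/a": "x" * 600, "https://example.com/b": "y" * 100}
    print(collect_long_texts(fake_pages))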