eljanmahammadli committed on
Commit 59fbf6a · 1 Parent(s): 8bd7fd1

changed split logic to resolve short generated text, added more search websites, and added some logging

Files changed (3)
  1. ai_generate.py +14 -3
  2. app.py +6 -9
  3. google_search.py +20 -3
ai_generate.py CHANGED

@@ -17,7 +17,7 @@ from langchain_community.embeddings.sentence_transformer import (
 )
 from langchain.schema import StrOutputParser
 from langchain_community.vectorstores import Chroma
-from langchain_text_splitters import CharacterTextSplitter
+from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
 from langchain import hub
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough, RunnableMap
@@ -44,10 +44,17 @@ vertexai.init(project="proprietary-info-detection", location="us-central1")
 gemini_client = GenerativeModel("gemini-1.5-pro-001")
 claude_client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
 
+# LLM params
 # For GPT-4 1 word is about 1.3 tokens.
 temperature = 1.0
 max_tokens = 2048
 
+# RAG params
+CHUNK_SIZE = 1024
+CHUNK_OVERLAP = CHUNK_SIZE // 8
+K = 10
+FETCH_K = 20
+
 llm_model_translation = {
     "LLaMA 3": "llama3-70b-8192",
     "OpenAI GPT 4o Mini": "gpt-4o-mini",
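
The splitter switch, together with the new CHUNK_SIZE and CHUNK_OVERLAP constants, is the "changed split logic" from the commit message: RecursiveCharacterTextSplitter falls back through progressively smaller separators instead of splitting on a single one, so chunks land much closer to the target size. A minimal sketch of that behaviour, assuming the same langchain-text-splitters package used in ai_generate.py; the sample text below is invented for illustration:

from langchain_text_splitters import RecursiveCharacterTextSplitter

CHUNK_SIZE = 1024
CHUNK_OVERLAP = CHUNK_SIZE // 8  # 128 characters shared between neighbouring chunks

# The recursive splitter tries "\n\n", then "\n", then " ", then "" until each
# piece fits under chunk_size, unlike CharacterTextSplitter, which splits on a
# single separator and can leave chunks far larger or smaller than intended.
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

sample_text = ("Paragraph about the scraped topic. " * 40 + "\n\n") * 5  # placeholder text
chunks = splitter.split_text(sample_text)

for idx, chunk in enumerate(chunks):
    print(f"Chunk {idx}: {len(chunk)} characters")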
 
@@ -82,7 +89,7 @@ def load_llm(model: str, api_key: str, temperature: float = 1.0, max_length: int
 
 def create_db_with_langchain(path: list[str], url_content: dict):
     all_docs = []
-    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
     embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
     if path:
         for file in path:
@@ -99,6 +106,10 @@ def create_db_with_langchain(path: list[str], url_content: dict):
         docs = text_splitter.split_documents([doc])
         all_docs.extend(docs)
 
+    # print docs
+    for idx, doc in enumerate(all_docs):
+        print(f"Doc: {idx} | Length = {len(doc.page_content)}")
+
     assert len(all_docs) > 0, "No PDFs or scrapped data provided"
     db = Chroma.from_documents(all_docs, embedding_function)
     return db
@@ -120,7 +131,7 @@ def generate_rag(
         print("Failed to load LLM. Aborting operation.")
         return None
     db = create_db_with_langchain(path, url_content)
-    retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 4, "fetch_k": 20})
+    retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K})
     rag_prompt = hub.pull("rlm/rag-prompt")
 
     def format_docs(docs):
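
Raising k from 4 to 10 hands the prompt more context, and MMR keeps those 10 chunks diverse by first fetching fetch_k=20 similarity candidates and then re-ranking them. A hedged sketch of the retrieval step in isolation, built from the same Chroma and SentenceTransformer pieces ai_generate.py imports; the documents and query are placeholders rather than repository data:

from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

K = 10
FETCH_K = 20

# Placeholder corpus standing in for the split PDF/web chunks.
docs = [Document(page_content=f"Placeholder passage number {i} about the query topic.") for i in range(30)]

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma.from_documents(docs, embedding_function)

# MMR: fetch FETCH_K nearest chunks, then keep K of them re-ranked for diversity.
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K})

retrieved = retriever.invoke("placeholder question")
print(f"Retrieved {len(retrieved)} chunks")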
app.py CHANGED

@@ -435,7 +435,7 @@ def generate_and_format(
 ):
     content_string = ""
     url_content = None
-    ai_model = "Claude Sonnet 3.5"
+    ai_model = "OpenAI GPT 4o"
     if google_search_check:
         date_from = build_date(year_from, month_from, day_from)
         date_to = build_date(year_to, month_to, day_to)
@@ -637,7 +637,7 @@ def create_interface():
     google_default = False
     with gr.Row():
         google_search_check = gr.Checkbox(
-            label="Enable Google Search For Recent Sources", value=google_default
+            label="Enable Internet Search For Recent Sources", value=google_default
         )
     with gr.Group(visible=google_default) as search_options:
         with gr.Row():
@@ -682,7 +682,7 @@ def create_interface():
     gr.Markdown("# Add Optional PDF Files with Information", elem_classes="text-center text-3xl mb-6")
     pdf_file_input = gr.File(label="Upload PDF(s)", file_count="multiple", file_types=[".pdf"])
 
-    # HIDE AI MODEL SELECTION
+    # NOTE: HIDE AI MODEL SELECTION
     # with gr.Group():
     # gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
     # ai_generator = gr.Dropdown(
@@ -710,18 +710,15 @@ def create_interface():
                     "Base Model",
                     "Large Model",
                     "XL Model",
-                    # "XL Law Model",
-                    # "XL Marketing Model",
-                    # "XL Child Style Model",
                 ],
-                value="Large Model",
+                value="XL Model",
                 label="Humanizer Model Version",
             )
         with gr.Row():
             temperature_slider = gr.Slider(
-                minimum=0.5, maximum=2.0, step=0.1, value=1.3, label="Temperature"
+                minimum=0.5, maximum=2.0, step=0.1, value=1.1, label="Temperature"
             )
-            top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=50, label="Top k")
+            top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=40, label="Top k")
         with gr.Row():
             repetition_penalty_slider = gr.Slider(
                 minimum=1.0, maximum=2.0, step=0.1, value=1, label="Repetition Penalty"
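
The app.py changes are default tweaks: the generator now falls back to "OpenAI GPT 4o", the search checkbox says "Internet" rather than "Google", the humanizer defaults to the XL Model, and the temperature/top-k defaults move to 1.1/40. A minimal, hedged Gradio sketch of the checkbox-shows-options pattern these widgets sit in; the toggle_search_options function and its wiring are assumptions, since that code is outside this diff:

import gradio as gr

google_default = False

with gr.Blocks() as demo:
    google_search_check = gr.Checkbox(
        label="Enable Internet Search For Recent Sources", value=google_default
    )
    # The options group starts hidden because google_default is False.
    with gr.Group(visible=google_default) as search_options:
        gr.Markdown("Date range and source filters would go here.")

    temperature_slider = gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.1, label="Temperature")
    top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=40, label="Top k")

    def toggle_search_options(checked):
        # Show the group when the checkbox is ticked, hide it otherwise.
        return gr.update(visible=checked)

    google_search_check.change(
        toggle_search_options, inputs=google_search_check, outputs=search_options
    )

if __name__ == "__main__":
    demo.launch()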
google_search.py CHANGED

@@ -29,21 +29,38 @@ def clean_html(text):
 
 
 def build_results_beautifulsoup(url_list):
-    # Scrape URLs in list
+    print("Starting to scrape URLs...")
     start_time = time.perf_counter()
+
+    # scrape URLs in list
     soups = asyncio.run(parallel_scrap(url_list))
-    print("Scraping processing time: ", time.perf_counter() - start_time)
+
+    scraping_time = time.perf_counter() - start_time
+    print(f"Scraping processing time: {scraping_time:.2f} seconds")
+
     result_content = {}
-    num_pages = 3
+    num_pages = 10
     count = 0
+
+    print("Starting to process each URL...")
     for url, soup in zip(url_list, soups):
         if count >= num_pages:
+            print(f"Reached the limit of {num_pages} pages. Stopping processing.")
             break
+
         if soup:
+            print(f"Processing URL: {url}")
             text = clean_html(soup.text)
             if len(text) > 500:
+                print(f"Adding content from URL: {url}, content length: {len(text)}")
                 result_content[url] = text
                 count += 1
+            else:
+                print(f"Skipped URL: {url}, content too short (length: {len(text)})")
+        else:
+            print(f"Skipped URL: {url}, no soup content available.")
+
+    print("Finished processing URLs.")
     return result_content
 
 
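
build_results_beautifulsoup now keeps up to 10 pages instead of 3 and logs why each URL was kept or skipped. The parallel_scrap coroutine it awaits is not part of this diff, so the sketch below is only an assumed shape for such a helper, using aiohttp plus BeautifulSoup; the real implementation in the repository may differ:

import asyncio

import aiohttp
from bs4 import BeautifulSoup


async def fetch_one(session, url):
    # Fetch a single page and parse it; return None on any failure so the
    # caller's "no soup content available" branch can skip it.
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as resp:
            html = await resp.text()
            return BeautifulSoup(html, "html.parser")
    except Exception as exc:
        print(f"Fetch failed for {url}: {exc}")
        return None


async def parallel_scrap(url_list):
    # Fire all requests concurrently and keep the results in input order.
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_one(session, url) for url in url_list))


if __name__ == "__main__":
    soups = asyncio.run(parallel_scrap(["https://example.com"]))
    print([soup is not None for soup in soups])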