Spaces:
Sleeping
Sleeping
add json checker
Browse files- .gitignore +5 -0
- app.py +43 -0
.gitignore
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dev.ipynb
|
2 |
+
.env
|
3 |
+
app-test.py
|
4 |
+
dev copie.ipynb
|
5 |
+
output.csv
|
app.py
CHANGED
@@ -6,6 +6,8 @@ from langchain_community.llms import HuggingFaceEndpoint
|
|
6 |
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
|
7 |
import gradio as gr
|
8 |
import subprocess
|
|
|
|
|
9 |
|
10 |
# Ensure Playwright installs required browsers and dependencies
|
11 |
subprocess.run(["playwright", "install"])
|
@@ -34,6 +36,29 @@ graph_config = {
|
|
34 |
},
|
35 |
"embeddings": {"model_instance": embedder_model_instance}
|
36 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
def scrape_and_summarize(prompt, source):
|
39 |
smart_scraper_graph = SmartScraperGraph(
|
@@ -42,9 +67,27 @@ def scrape_and_summarize(prompt, source):
|
|
42 |
config=graph_config
|
43 |
)
|
44 |
result = smart_scraper_graph.run()
|
|
|
|
|
|
|
|
|
|
|
45 |
exec_info = smart_scraper_graph.get_execution_info()
|
46 |
return result, prettify_exec_info(exec_info)
|
47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
# Gradio interface
|
49 |
with gr.Blocks() as demo:
|
50 |
gr.Markdown("# Scrape websites, no-code version")
|
|
|
6 |
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
|
7 |
import gradio as gr
|
8 |
import subprocess
|
9 |
+
import json
|
10 |
+
import re
|
11 |
|
12 |
# Ensure Playwright installs required browsers and dependencies
|
13 |
subprocess.run(["playwright", "install"])
|
|
|
36 |
},
|
37 |
"embeddings": {"model_instance": embedder_model_instance}
|
38 |
}
|
39 |
+
#######
|
#######
def clean_json_string(json_str):
    """
    Strip any leading commentary before the first JSON value in *json_str*.

    Scans for the earliest '{' and '[' openers and attempts to parse a JSON
    value starting at each candidate position (earliest first).  On success
    the exact JSON substring is returned, which also drops any trailing
    non-JSON text (e.g. an LLM's closing remarks).  If no candidate parses,
    the original string is returned unchanged.
    """
    # Collect the positions of both possible JSON openers; -1 means absent.
    candidates = [i for i in (json_str.find('{'), json_str.find('[')) if i != -1]
    if not candidates:
        return json_str  # No JSON markers found at all.

    decoder = json.JSONDecoder()  # hoisted: reused for every candidate
    for start in sorted(candidates):
        try:
            # raw_decode tolerates trailing text, unlike json.loads, and
            # reports where the JSON value ends.
            _, end = decoder.raw_decode(json_str[start:])
            return json_str[start:start + end]
        except json.JSONDecodeError:
            # This opener was part of the surrounding prose; try the next one.
            continue

    return json_str  # Return original if no valid JSON could be extracted.
62 |
|
63 |
def scrape_and_summarize(prompt, source):
    """
    Run a SmartScraperGraph over *source* with the user's *prompt*.

    Returns a pair: the (JSON-cleaned, when textual) scrape result and a
    prettified execution-info report.
    """
    graph = SmartScraperGraph(
        prompt=prompt,
        source=source,
        config=graph_config,
    )
    result = graph.run()

    # String results may carry leading chatter before the JSON payload;
    # clean_json_string strips it when possible.
    if isinstance(result, str):
        result = clean_json_string(result)

    return result, prettify_exec_info(graph.get_execution_info())
|
77 |
|
78 |
+
|
79 |
+
|
80 |
+
#######
|
81 |
+
# def scrape_and_summarize(prompt, source):
|
82 |
+
# smart_scraper_graph = SmartScraperGraph(
|
83 |
+
# prompt=prompt,
|
84 |
+
# source=source,
|
85 |
+
# config=graph_config
|
86 |
+
# )
|
87 |
+
# result = smart_scraper_graph.run()
|
88 |
+
# exec_info = smart_scraper_graph.get_execution_info()
|
89 |
+
# return result, prettify_exec_info(exec_info)
|
90 |
+
|
91 |
# Gradio interface
|
92 |
with gr.Blocks() as demo:
|
93 |
gr.Markdown("# Scrape websites, no-code version")
|