HuuHuy227 committed on
Commit
d6d5bda
·
1 Parent(s): 7b9f840

new-modified

Browse files
Files changed (4) hide show
  1. Dockerfile +1 -4
  2. app.py +177 -227
  3. requirements.txt +5 -5
  4. utils.py +0 -133
Dockerfile CHANGED
@@ -6,15 +6,12 @@ WORKDIR /app
6
 
7
  # Install system dependencies for cairosvg
8
  RUN apt-get update && apt-get install -y \
 
9
  build-essential \
10
  python3-dev \
11
  python3-pip \
12
  python3-setuptools \
13
- libcairo2-dev \
14
  pkg-config \
15
- libcairo2 \
16
- libcairo-gobject2 \
17
- python3-cairo \
18
  libpango1.0-dev \
19
  shared-mime-info \
20
  mime-support \
 
6
 
7
  # Install system dependencies for cairosvg
8
  RUN apt-get update && apt-get install -y \
9
+ graphviz \
10
  build-essential \
11
  python3-dev \
12
  python3-pip \
13
  python3-setuptools \
 
14
  pkg-config \
 
 
 
15
  libpango1.0-dev \
16
  shared-mime-info \
17
  mime-support \
app.py CHANGED
@@ -1,246 +1,196 @@
1
  import streamlit as st
2
  import spacy
3
- from spacy import displacy
4
  import pandas as pd
5
- from collections import Counter
6
- import plotly.express as px
7
- from utils import analyze_text
8
- from utils import svg_to_png
9
  import base64
 
 
10
 
11
- # Set page to wide mode for better visualization
12
- st.set_page_config(layout="wide")
13
 
14
- # Load English language model
15
- @st.cache_resource
16
- def load_model():
17
- return spacy.load('en_core_web_md')
 
 
 
 
 
 
 
18
 
19
- nlp = load_model()
20
-
21
- # Streamlit UI
22
- st.markdown("<h1 style='text-align: center; color: white;'>English Sentences Analyzer</h1>", unsafe_allow_html=True)
23
-
24
- # Text Input and Help side by side
25
- col1, col2 = st.columns([3, 1])
26
- with col1:
27
- text_input = st.text_area(
28
- "Enter English text:",
29
- "The ambitious startup in Silicon Valley developed an innovative AI system last year. " +
30
- "Google and Microsoft showed interest in acquiring the technology for $50 million.",
31
- height=200
32
- )
33
- analyze_button = st.button("Analyze Text")
34
-
35
- with col2:
36
- with st.expander("Quick Guide", expanded=True):
37
- st.markdown("""
38
- 1. Enter your text in the input box
39
- 2. Click "Analyze Text" to see:
40
- - Sentence structure visualization
41
- - Detailed token analysis
42
- - Additional analysis in expandable sections
43
- 3. Use mouse wheel or buttons to zoom the visualization
44
- 4. Click and drag to pan around
45
- """)
46
-
47
- if analyze_button:
48
- if text_input:
49
- tokens, entities, noun_chunks, stats, doc = analyze_text(nlp, text_input)
50
-
51
- # 1. Dependency Parse with improved visualization
52
- st.header("Sentence Structure Analysis")
53
-
54
- # Generate sentence visualizations
55
- sentences = list(doc.sents)
56
- sentence_htmls = []
57
- for sent in sentences:
58
- sent_html = displacy.render(sent, style="dep", options={
59
- "distance": 120,
60
- "arrow_stroke": 2,
61
- "arrow_width": 8,
62
- "font": "Arial",
63
- "bg": "#ffffff",
64
  })
65
- # Ensure proper SVG structure
66
- if not sent_html.startswith('<?xml'):
67
- sent_html = '<?xml version="1.0" encoding="UTF-8"?>' + sent_html
68
- sentence_htmls.append(sent_html)
69
-
70
- doc_html = "<br><br>".join(sentence_htmls)
 
 
 
 
 
71
 
72
- # Convert SVG to PNG with error handling
73
- png_bytes = svg_to_png(doc_html)
74
- if png_bytes is None:
75
- st.error("Failed to generate visualization")
76
- else:
77
- png_b64 = base64.b64encode(png_bytes).decode()
78
 
79
- # CSS for image container
80
- st.markdown("""
81
- <style>
82
- .image-container {
83
- position: relative;
84
- overflow: hidden;
85
- background: #b4b4b4;
86
- border: 1px solid #ddd;
87
- border-radius: 5px;
88
- margin: 10px 0;
89
- }
90
- .zoomable-image {
91
- transform-origin: 0 0;
92
- transition: transform 0.1s;
93
- }
94
- .download-btn {
95
- position: absolute;
96
- right: 10px;
97
- top: 10px;
98
- background: rgba(255, 255, 255, 0.8);
99
- border: 1px solid #ddd;
100
- border-radius: 4px;
101
- padding: 5px 10px;
102
- cursor: pointer;
103
- }
104
- .download-btn:hover {
105
- background: white;
106
- }
107
- </style>
108
- """, unsafe_allow_html=True)
109
 
110
- # JavaScript for zoom and pan functionality
111
- js_code = f"""
112
- <div class="image-container" id="imageContainer">
113
- <img src="data:image/png;base64,{png_b64}"
114
- class="zoomable-image"
115
- id="zoomableImage"
116
- style="max-width: 100%;">
117
- <a class="download-btn"
118
- href="data:image/png;base64,{png_b64}"
119
- download="sentence_structure.png">
120
- 📥 Download
121
- </a>
122
- </div>
123
- <script>
124
- const container = document.getElementById('imageContainer');
125
- const img = document.getElementById('zoomableImage');
126
- let scale = 1;
127
- let isPanning = false;
128
- let startX, startY, translateX = 0, translateY = 0;
129
-
130
- // Zoom functionality
131
- container.addEventListener('wheel', (e) => {{
132
- e.preventDefault();
133
- const rect = container.getBoundingClientRect();
134
- const mouseX = e.clientX - rect.left;
135
- const mouseY = e.clientY - rect.top;
136
-
137
- const delta = e.deltaY * -0.01;
138
- const newScale = Math.max(1, Math.min(scale + delta, 4));
139
- const scaleChange = newScale / scale;
140
-
141
- translateX = mouseX - (mouseX - translateX) * scaleChange;
142
- translateY = mouseY - (mouseY - translateY) * scaleChange;
143
-
144
- scale = newScale;
145
- updateTransform();
146
- }});
147
-
148
- // Pan functionality
149
- container.addEventListener('mousedown', (e) => {{
150
- isPanning = true;
151
- startX = e.clientX - translateX;
152
- startY = e.clientY - translateY;
153
- container.style.cursor = 'grabbing';
154
- }});
155
-
156
- container.addEventListener('mousemove', (e) => {{
157
- if (!isPanning) return;
158
- translateX = e.clientX - startX;
159
- translateY = e.clientY - startY;
160
- updateTransform();
161
- }});
162
-
163
- container.addEventListener('mouseup', () => {{
164
- isPanning = false;
165
- container.style.cursor = 'grab';
166
- }});
167
-
168
- container.addEventListener('mouseleave', () => {{
169
- isPanning = false;
170
- container.style.cursor = 'grab';
171
- }});
172
-
173
- function updateTransform() {{
174
- img.style.transform = `translate(${{translateX}}px, ${{translateY}}px) scale(${{scale}})`;
175
- }}
176
-
177
- // Initialize
178
- container.style.cursor = 'grab';
179
- container.style.height = '500px';
180
- </script>
181
- """
182
 
183
- st.markdown(js_code, unsafe_allow_html=True)
184
-
185
- # Add caption
186
- col1, col2 = st.columns([3, 1])
187
- with col1:
188
- st.caption("💡 Tip: Use mouse wheel to zoom, click and drag to pan around")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
- # 2. Detailed Token Analysis
191
- st.header("Token Analysis")
192
- token_df = pd.DataFrame(tokens)
 
 
 
 
193
 
194
- # Create two columns for token distribution and token details
195
- col1, col2 = st.columns([1, 2])
196
 
197
- with col1:
198
- # Token distribution visualization
199
- pos_counts = Counter([token['POS'] for token in tokens])
200
- fig = px.pie(
201
- values=list(pos_counts.values()),
202
- names=list(pos_counts.keys()),
203
- title="Parts of Speech Distribution"
204
- )
205
- fig.update_layout(height=400)
206
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
207
 
208
- with col2:
209
- st.dataframe(token_df, use_container_width=True)
210
-
211
- # Additional Analysis in Expanders
212
- with st.expander("Named Entities"):
213
- if entities:
214
- ent_df = pd.DataFrame(entities)
215
 
216
- # Visualization of entity distribution
217
- entity_counts = Counter([ent['Label'] for ent in entities])
218
- fig = px.bar(
219
- x=list(entity_counts.keys()),
220
- y=list(entity_counts.values()),
221
- title="Distribution of Named Entities",
222
- labels={'x': 'Entity Type', 'y': 'Count'}
223
- )
224
- st.plotly_chart(fig)
225
 
226
- st.table(ent_df)
227
- else:
228
- st.info("No named entities found in the text.")
229
-
230
- with st.expander("Noun Chunks (Phrases)"):
231
- if noun_chunks:
232
- st.table(pd.DataFrame(noun_chunks))
233
- else:
234
- st.info("No noun chunks found in the text.")
235
-
236
- with st.expander("Text Statistics"):
237
- col1, col2, col3 = st.columns(3)
238
- with col1:
239
- st.metric("Word Count", stats['Word Count'])
240
- with col2:
241
- st.metric("Sentence Count", stats['Sentence Count'])
242
- with col3:
243
- st.metric("Unique Words", stats['Unique Words'])
244
 
245
- st.metric("Average Words per Sentence", stats['Average Words per Sentence'])
246
- st.metric("Stop Words Percentage", f"{stats['Stop Words %']}%")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import spacy
3
+ import graphviz
4
  import pandas as pd
 
 
 
 
5
  import base64
6
+ import shutil
7
+ import subprocess
8
 
9
+ # Load English language model for spaCy
10
+ nlp = spacy.load('en_core_web_md')
11
 
12
def check_graphviz_installation():
    """Return True when the Graphviz `dot` binary is installed and runnable."""
    # Fast path: `dot` must at least be discoverable on PATH.
    if shutil.which('dot') is None:
        return False
    # A PATH hit alone is not proof the binary works (broken install,
    # missing shared libs) — probe it with a cheap `dot -V` call.
    try:
        subprocess.run(['dot', '-V'], capture_output=True, check=True)
    except (subprocess.SubprocessError, OSError):
        return False
    return True
23
 
24
def identify_clauses(doc):
    """
    Identify clauses in a parsed sentence, separating the independent
    (main) clause from dependent (subordinate) clauses.

    Args:
        doc: a spaCy ``Doc`` (or compatible object) carrying a dependency
            parse — tokens expose ``dep_``, ``i``, ``text``,
            ``left_edge``/``right_edge``, and the root exposes ``subtree``.

    Returns:
        list[dict]: one ``{"Type": ..., "Text": ...}`` entry per clause,
        the independent clause first, followed by the subordinate clauses
        in document order.
    """
    # Dependency labels that mark a subordinate clause, mapped to a
    # human-readable clause type.
    clause_types = {
        "ccomp": "Complement Clause",
        "xcomp": "Open Complement Clause",
        "advcl": "Adverbial Clause",
        "relcl": "Adjective Clause",
    }

    clauses = []

    # First identify all subordinate clauses and their spans.
    subordinate_spans = []
    for token in doc:
        if token.dep_ in clause_types:
            # left_edge/right_edge delimit the token's whole subtree,
            # i.e. the full text of the subordinate clause.
            span = doc[token.left_edge.i:token.right_edge.i + 1]
            subordinate_spans.append({
                "span": span,
                "type": clause_types[token.dep_],
            })

    # Find the root of the parse (the head of the main clause).
    root = next((token for token in doc if token.dep_ == "ROOT"), None)

    if root:
        # The main clause is the root's subtree minus every token that
        # belongs to a subordinate clause.
        main_clause_tokens = set(root.subtree)
        for sub_clause in subordinate_spans:
            for token in sub_clause["span"]:
                main_clause_tokens.discard(token)

        # Reassemble the main clause in document order. Sorting the tokens
        # themselves by their index is robust to repeated words — the old
        # text-keyed sort (`[t.i for t in doc if t.text == x][0]`) gave every
        # duplicate word the first occurrence's index, scrambling the order,
        # and rescanned the whole doc per word (O(n^2)).
        ordered = sorted(main_clause_tokens, key=lambda t: t.i)
        main_clause_text = " ".join(t.text for t in ordered)
        # Drop stray punctuation left behind by the token-level join.
        main_clause_text = main_clause_text.strip().replace(",", "").replace(".", "")
        clauses.append({"Type": "Independent Clause", "Text": main_clause_text})

    # Append the subordinate clauses after the main clause.
    for sub_clause in subordinate_spans:
        clauses.append({
            "Type": sub_clause["type"],
            "Text": sub_clause["span"].text,
        })

    return clauses
76
+
77
def analyze_clause_functions(doc):
    """
    Describe the grammatical function of each clause-marking token.

    Returns a list of ``{"Type", "Function"}`` dicts — one per token whose
    dependency label marks a clause — in document order.
    """
    # Dependency label -> (clause type, grammatical function).
    role_table = {
        "ROOT": ("Independent Clause", "Express the primary action or state"),
        "ccomp": ("Complement Clause", "Acts as object of the main verb"),
        "xcomp": ("Open Complement Clause", "Predicate complement without its own subject"),
        "advcl": ("Adverbial Clause", "Modifies the verb like an adverb"),
        "relcl": ("Adjective Clause", "Modifies a noun like an adjective"),
    }

    functions = []
    for token in doc:
        role = role_table.get(token.dep_)
        if role is not None:
            clause_type, description = role
            functions.append({"Type": clause_type, "Function": description})

    return functions
96
+
97
def create_dependency_graph(doc):
    """
    Build a graphviz Digraph of the sentence's dependency tree.

    Returns None when the Graphviz binaries are not installed, so callers
    can degrade gracefully instead of failing at render time.
    """
    if not check_graphviz_installation():
        return None

    tree = graphviz.Digraph(comment='Dependency Tree')

    # One node per token, labelled "word\n(POS)", keyed by token index.
    for token in doc:
        tree.node(str(token.i), f"{token.text}\n({token.pos_})")

    # One edge per head->child dependency; the root is its own head and
    # therefore gets no incoming edge.
    for token in doc:
        if token.head is not token:
            tree.edge(str(token.head.i), str(token.i), token.dep_)

    return tree
116
 
117
def get_graph_download_link(dot):
    """
    Render the graph to PDF and wrap it in an HTML download link.

    Returns the anchor-tag markup on success, or a plain error string
    if rendering fails (e.g. Graphviz binaries missing).
    """
    try:
        # Render the graph to PDF bytes in memory.
        pdf_bytes = dot.pipe(format='pdf')
    except Exception as exc:
        return f"Error generating download link: {str(exc)}"

    # Embed the PDF as a base64 data URI so no file is written to disk.
    encoded = base64.b64encode(pdf_bytes).decode()
    return f'<a href="data:application/pdf;base64,{encoded}" download="syntax_tree.pdf">Download Syntax Tree (PDF)</a>'
132
+
133
def main():
    """Streamlit entry point: analyze an English sentence's clauses,
    their grammatical functions, and its dependency syntax tree."""
    # Set page to wide mode for better visualization
    st.set_page_config(layout="wide")
    st.markdown("<h1 style='text-align: center; color: white;'>English Clause Analyzer</h1>", unsafe_allow_html=True)
    st.write("Enter an English sentence to analyze its clauses, their functions, and syntax tree.")

    # Input text
    text = st.text_area("Enter your sentence:", "When I arrived at the station, the train had already left.", height=100)

    if st.button("Analyze"):
        if not text:
            # Previously an empty input was silently ignored; tell the user why
            # nothing happened.
            st.warning("Please enter a sentence to analyze.")
            return

        # Process the text with the module-level spaCy pipeline.
        doc = nlp(text)

        # Two-column layout: clause tables left, syntax tree right.
        col1, col2 = st.columns(2)

        with col1:
            # Identify clauses
            clauses = identify_clauses(doc)
            # (was an f-string with no placeholders)
            st.subheader("Clauses Analysis")

            # Convert clauses to DataFrame for better presentation
            df_clauses = pd.DataFrame(clauses)
            st.table(df_clauses.style.set_properties(**{
                'background-color': 'rgba(0,0,0,0.1)',
                'color': 'white'
            }))

            # Display clause functions
            functions = analyze_clause_functions(doc)
            st.subheader("Clause Functions")
            df_functions = pd.DataFrame(functions)
            st.table(df_functions.style.set_properties(**{
                'background-color': 'rgba(0,0,0,0.1)',
                'color': 'white'
            }))

        with col2:
            # Display dependency visualization
            st.subheader("Syntax Tree Visualization")
            if not check_graphviz_installation():
                st.error("Graphviz is not installed. Please install it using:")
                st.code("sudo apt-get install graphviz")
                st.markdown("After installation, restart the application.")
            else:
                dot = create_dependency_graph(doc)
                st.graphviz_chart(dot)

                # Add download button for the graph
                st.markdown(get_graph_download_link(dot), unsafe_allow_html=True)

        # Display part-of-speech tags in a table
        st.subheader("Part-of-Speech Analysis")
        pos_data = [{"Word": token.text, "Part of Speech": token.pos_,
                     "Description": spacy.explain(token.pos_)} for token in doc]
        df_pos = pd.DataFrame(pos_data)
        st.table(df_pos.style.set_properties(**{
            'background-color': 'rgba(0,0,0,0.1)',
            'color': 'white'
        }))

if __name__ == "__main__":
    main()
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
- streamlit
2
- spacy
3
- pandas
4
- plotly
5
- cairosvg
 
1
+ streamlit
2
+ nltk
3
+ spacy
4
+ matplotlib
5
+ graphviz
utils.py DELETED
@@ -1,133 +0,0 @@
1
- import io
2
- from cairosvg import svg2png
3
- from PIL import Image
4
- # import base64
5
-
6
- def get_entity_explanation(label):
7
- """Return explanation for named entity labels"""
8
- explanations = {
9
- 'PERSON': 'People, including fictional',
10
- 'NORP': 'Nationalities, religious or political groups',
11
- 'FAC': 'Buildings, airports, highways, bridges, etc.',
12
- 'ORG': 'Companies, agencies, institutions, etc.',
13
- 'GPE': 'Countries, cities, states',
14
- 'LOC': 'Non-GPE locations, mountain ranges, water bodies',
15
- 'PRODUCT': 'Objects, vehicles, foods, etc.',
16
- 'EVENT': 'Named hurricanes, battles, wars, sports events, etc.',
17
- 'WORK_OF_ART': 'Titles of books, songs, etc.',
18
- 'DATE': 'Absolute or relative dates or periods',
19
- 'TIME': 'Times smaller than a day',
20
- 'MONEY': 'Monetary values, including unit',
21
- 'QUANTITY': 'Measurements, as of weight or distance'
22
- }
23
- return explanations.get(label, 'Other type of entity')
24
-
25
- def analyze_text(nlp, text):
26
- doc = nlp(text)
27
-
28
- # Basic tokenization and POS analysis
29
- tokens = [{
30
- 'Text': token.text,
31
- 'Lemma': token.lemma_,
32
- 'POS': token.pos_,
33
- 'Tag': token.tag_,
34
- 'Dependency': token.dep_,
35
- 'Shape': token.shape_,
36
- 'Is Alpha': token.is_alpha,
37
- 'Is Stop': token.is_stop
38
- } for token in doc]
39
-
40
- # Named Entity Recognition
41
- entities = [{
42
- 'Text': ent.text,
43
- 'Label': ent.label_,
44
- 'Explanation': get_entity_explanation(ent.label_),
45
- 'Start': ent.start_char,
46
- 'End': ent.end_char
47
- } for ent in doc.ents]
48
-
49
- # Noun Chunks (phrases)
50
- noun_chunks = [{
51
- 'Text': chunk.text,
52
- 'Root Text': chunk.root.text,
53
- 'Root Dep': chunk.root.dep_,
54
- 'Root Head Text': chunk.root.head.text
55
- } for chunk in doc.noun_chunks]
56
-
57
- # Text Statistics
58
- stats = {
59
- 'Word Count': len([token for token in doc if not token.is_punct]),
60
- 'Sentence Count': len(list(doc.sents)),
61
- 'Average Words per Sentence': round(len([token for token in doc if not token.is_punct]) / len(list(doc.sents)), 2),
62
- 'Unique Words': len(set([token.text.lower() for token in doc if token.is_alpha])),
63
- 'Stop Words %': round(len([token for token in doc if token.is_stop]) / len(doc) * 100, 2)
64
- }
65
-
66
- return tokens, entities, noun_chunks, stats, doc
67
-
68
- def svg_to_png(svg_content, background_color='white'):
69
- """Convert SVG to PNG with specified background color"""
70
- # Split multiple SVGs if present
71
- svg_parts = svg_content.split('<br><br>')
72
- images = []
73
-
74
- for svg in svg_parts:
75
- # Add SVG namespace if missing
76
- if not 'xmlns="http://www.w3.org/2000/svg"' in svg:
77
- svg = svg.replace('<svg', '<svg xmlns="http://www.w3.org/2000/svg"')
78
-
79
- try:
80
- # Convert SVG to PNG bytes
81
- png_bytes = svg2png(bytestring=svg.encode('utf-8'),
82
- background_color=background_color,
83
- scale=1)
84
-
85
- # Create PIL Image from PNG bytes
86
- img = Image.open(io.BytesIO(png_bytes))
87
-
88
- # Convert RGBA to RGB with white background
89
- if img.mode == 'RGBA':
90
- background = Image.new('RGB', img.size, background_color)
91
- background.paste(img, mask=img.split()[3]) # Use alpha channel as mask
92
- img = background
93
-
94
- # Add some padding
95
- padding = 20 # pixels
96
- img_with_padding = Image.new('RGB',
97
- (img.width, img.height + padding * 2),
98
- background_color)
99
- img_with_padding.paste(img, (0, padding))
100
- images.append(img_with_padding)
101
-
102
- except Exception as e:
103
- st.error(f"Error converting SVG to PNG: {str(e)}")
104
- continue
105
-
106
- if not images:
107
- return None
108
-
109
- # Combine images vertically if there are multiple
110
- if len(images) > 1:
111
- # Calculate total height and max width
112
- total_height = sum(img.height for img in images)
113
- max_width = max(img.width for img in images)
114
-
115
- # Create new image to hold all sentences
116
- combined = Image.new('RGB', (max_width, total_height), background_color)
117
-
118
- # Paste each image
119
- y_offset = 0
120
- for img in images:
121
- # Center image horizontally
122
- x_offset = (max_width - img.width) // 2
123
- combined.paste(img, (x_offset, y_offset))
124
- y_offset += img.height
125
- else:
126
- combined = images[0]
127
-
128
- # Convert to bytes for Streamlit
129
- img_byte_arr = io.BytesIO()
130
- combined.save(img_byte_arr, format='PNG')
131
- img_byte_arr.seek(0)
132
-
133
- return img_byte_arr.getvalue()