HuuHuy227 committed on
Commit
d6d5bda
·
1 Parent(s): 7b9f840

new-modified

Browse files
Files changed (4) hide show
  1. Dockerfile +1 -4
  2. app.py +177 -227
  3. requirements.txt +5 -5
  4. utils.py +0 -133
Dockerfile CHANGED
@@ -6,15 +6,12 @@ WORKDIR /app
6
 
7
  # Install system dependencies for cairosvg
8
  RUN apt-get update && apt-get install -y \
 
9
  build-essential \
10
  python3-dev \
11
  python3-pip \
12
  python3-setuptools \
13
- libcairo2-dev \
14
  pkg-config \
15
- libcairo2 \
16
- libcairo-gobject2 \
17
- python3-cairo \
18
  libpango1.0-dev \
19
  shared-mime-info \
20
  mime-support \
 
6
 
7
  # Install system dependencies for cairosvg
8
  RUN apt-get update && apt-get install -y \
9
+ graphviz \
10
  build-essential \
11
  python3-dev \
12
  python3-pip \
13
  python3-setuptools \
 
14
  pkg-config \
 
 
 
15
  libpango1.0-dev \
16
  shared-mime-info \
17
  mime-support \
app.py CHANGED
@@ -1,246 +1,196 @@
1
  import streamlit as st
2
  import spacy
3
- from spacy import displacy
4
  import pandas as pd
5
- from collections import Counter
6
- import plotly.express as px
7
- from utils import analyze_text
8
- from utils import svg_to_png
9
  import base64
 
 
10
 
11
- # Set page to wide mode for better visualization
12
- st.set_page_config(layout="wide")
13
 
14
- # Load English language model
15
- @st.cache_resource
16
- def load_model():
17
- return spacy.load('en_core_web_md')
 
 
 
 
 
 
 
18
 
19
- nlp = load_model()
20
-
21
- # Streamlit UI
22
- st.markdown("<h1 style='text-align: center; color: white;'>English Sentences Analyzer</h1>", unsafe_allow_html=True)
23
-
24
- # Text Input and Help side by side
25
- col1, col2 = st.columns([3, 1])
26
- with col1:
27
- text_input = st.text_area(
28
- "Enter English text:",
29
- "The ambitious startup in Silicon Valley developed an innovative AI system last year. " +
30
- "Google and Microsoft showed interest in acquiring the technology for $50 million.",
31
- height=200
32
- )
33
- analyze_button = st.button("Analyze Text")
34
-
35
- with col2:
36
- with st.expander("Quick Guide", expanded=True):
37
- st.markdown("""
38
- 1. Enter your text in the input box
39
- 2. Click "Analyze Text" to see:
40
- - Sentence structure visualization
41
- - Detailed token analysis
42
- - Additional analysis in expandable sections
43
- 3. Use mouse wheel or buttons to zoom the visualization
44
- 4. Click and drag to pan around
45
- """)
46
-
47
- if analyze_button:
48
- if text_input:
49
- tokens, entities, noun_chunks, stats, doc = analyze_text(nlp, text_input)
50
-
51
- # 1. Dependency Parse with improved visualization
52
- st.header("Sentence Structure Analysis")
53
-
54
- # Generate sentence visualizations
55
- sentences = list(doc.sents)
56
- sentence_htmls = []
57
- for sent in sentences:
58
- sent_html = displacy.render(sent, style="dep", options={
59
- "distance": 120,
60
- "arrow_stroke": 2,
61
- "arrow_width": 8,
62
- "font": "Arial",
63
- "bg": "#ffffff",
64
  })
65
- # Ensure proper SVG structure
66
- if not sent_html.startswith('<?xml'):
67
- sent_html = '<?xml version="1.0" encoding="UTF-8"?>' + sent_html
68
- sentence_htmls.append(sent_html)
69
-
70
- doc_html = "<br><br>".join(sentence_htmls)
 
 
 
 
 
71
 
72
- # Convert SVG to PNG with error handling
73
- png_bytes = svg_to_png(doc_html)
74
- if png_bytes is None:
75
- st.error("Failed to generate visualization")
76
- else:
77
- png_b64 = base64.b64encode(png_bytes).decode()
78
 
79
- # CSS for image container
80
- st.markdown("""
81
- <style>
82
- .image-container {
83
- position: relative;
84
- overflow: hidden;
85
- background: #b4b4b4;
86
- border: 1px solid #ddd;
87
- border-radius: 5px;
88
- margin: 10px 0;
89
- }
90
- .zoomable-image {
91
- transform-origin: 0 0;
92
- transition: transform 0.1s;
93
- }
94
- .download-btn {
95
- position: absolute;
96
- right: 10px;
97
- top: 10px;
98
- background: rgba(255, 255, 255, 0.8);
99
- border: 1px solid #ddd;
100
- border-radius: 4px;
101
- padding: 5px 10px;
102
- cursor: pointer;
103
- }
104
- .download-btn:hover {
105
- background: white;
106
- }
107
- </style>
108
- """, unsafe_allow_html=True)
109
 
110
- # JavaScript for zoom and pan functionality
111
- js_code = f"""
112
- <div class="image-container" id="imageContainer">
113
- <img src="data:image/png;base64,{png_b64}"
114
- class="zoomable-image"
115
- id="zoomableImage"
116
- style="max-width: 100%;">
117
- <a class="download-btn"
118
- href="data:image/png;base64,{png_b64}"
119
- download="sentence_structure.png">
120
- 📥 Download
121
- </a>
122
- </div>
123
- <script>
124
- const container = document.getElementById('imageContainer');
125
- const img = document.getElementById('zoomableImage');
126
- let scale = 1;
127
- let isPanning = false;
128
- let startX, startY, translateX = 0, translateY = 0;
129
-
130
- // Zoom functionality
131
- container.addEventListener('wheel', (e) => {{
132
- e.preventDefault();
133
- const rect = container.getBoundingClientRect();
134
- const mouseX = e.clientX - rect.left;
135
- const mouseY = e.clientY - rect.top;
136
-
137
- const delta = e.deltaY * -0.01;
138
- const newScale = Math.max(1, Math.min(scale + delta, 4));
139
- const scaleChange = newScale / scale;
140
-
141
- translateX = mouseX - (mouseX - translateX) * scaleChange;
142
- translateY = mouseY - (mouseY - translateY) * scaleChange;
143
-
144
- scale = newScale;
145
- updateTransform();
146
- }});
147
-
148
- // Pan functionality
149
- container.addEventListener('mousedown', (e) => {{
150
- isPanning = true;
151
- startX = e.clientX - translateX;
152
- startY = e.clientY - translateY;
153
- container.style.cursor = 'grabbing';
154
- }});
155
-
156
- container.addEventListener('mousemove', (e) => {{
157
- if (!isPanning) return;
158
- translateX = e.clientX - startX;
159
- translateY = e.clientY - startY;
160
- updateTransform();
161
- }});
162
-
163
- container.addEventListener('mouseup', () => {{
164
- isPanning = false;
165
- container.style.cursor = 'grab';
166
- }});
167
-
168
- container.addEventListener('mouseleave', () => {{
169
- isPanning = false;
170
- container.style.cursor = 'grab';
171
- }});
172
-
173
- function updateTransform() {{
174
- img.style.transform = `translate(${{translateX}}px, ${{translateY}}px) scale(${{scale}})`;
175
- }}
176
-
177
- // Initialize
178
- container.style.cursor = 'grab';
179
- container.style.height = '500px';
180
- </script>
181
- """
182
 
183
- st.markdown(js_code, unsafe_allow_html=True)
184
-
185
- # Add caption
186
- col1, col2 = st.columns([3, 1])
187
- with col1:
188
- st.caption("💡 Tip: Use mouse wheel to zoom, click and drag to pan around")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
- # 2. Detailed Token Analysis
191
- st.header("Token Analysis")
192
- token_df = pd.DataFrame(tokens)
 
 
 
 
193
 
194
- # Create two columns for token distribution and token details
195
- col1, col2 = st.columns([1, 2])
196
 
197
- with col1:
198
- # Token distribution visualization
199
- pos_counts = Counter([token['POS'] for token in tokens])
200
- fig = px.pie(
201
- values=list(pos_counts.values()),
202
- names=list(pos_counts.keys()),
203
- title="Parts of Speech Distribution"
204
- )
205
- fig.update_layout(height=400)
206
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
207
 
208
- with col2:
209
- st.dataframe(token_df, use_container_width=True)
210
-
211
- # Additional Analysis in Expanders
212
- with st.expander("Named Entities"):
213
- if entities:
214
- ent_df = pd.DataFrame(entities)
215
 
216
- # Visualization of entity distribution
217
- entity_counts = Counter([ent['Label'] for ent in entities])
218
- fig = px.bar(
219
- x=list(entity_counts.keys()),
220
- y=list(entity_counts.values()),
221
- title="Distribution of Named Entities",
222
- labels={'x': 'Entity Type', 'y': 'Count'}
223
- )
224
- st.plotly_chart(fig)
225
 
226
- st.table(ent_df)
227
- else:
228
- st.info("No named entities found in the text.")
229
-
230
- with st.expander("Noun Chunks (Phrases)"):
231
- if noun_chunks:
232
- st.table(pd.DataFrame(noun_chunks))
233
- else:
234
- st.info("No noun chunks found in the text.")
235
-
236
- with st.expander("Text Statistics"):
237
- col1, col2, col3 = st.columns(3)
238
- with col1:
239
- st.metric("Word Count", stats['Word Count'])
240
- with col2:
241
- st.metric("Sentence Count", stats['Sentence Count'])
242
- with col3:
243
- st.metric("Unique Words", stats['Unique Words'])
244
 
245
- st.metric("Average Words per Sentence", stats['Average Words per Sentence'])
246
- st.metric("Stop Words Percentage", f"{stats['Stop Words %']}%")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import spacy
3
+ import graphviz
4
  import pandas as pd
 
 
 
 
5
  import base64
6
+ import shutil
7
+ import subprocess
8
 
9
+ # Load English language model for spaCy
10
+ nlp = spacy.load('en_core_web_md')
11
 
12
def check_graphviz_installation():
    """Return True when the Graphviz `dot` binary is installed and runnable."""
    # Fast path: `dot` must at least be discoverable on PATH.
    if shutil.which('dot') is None:
        return False
    # A PATH hit alone is not proof the binary works (broken install,
    # missing shared libs) — probe it with a cheap `dot -V` call.
    try:
        subprocess.run(['dot', '-V'], capture_output=True, check=True)
    except (subprocess.SubprocessError, OSError):
        return False
    return True
23
 
24
def identify_clauses(doc):
    """
    Identify clauses in a parsed sentence, separating the independent
    (main) clause from dependent (subordinate) clauses.

    Args:
        doc: a spaCy ``Doc`` (or compatible object) carrying a dependency
            parse — tokens expose ``dep_``, ``i``, ``text``,
            ``left_edge``/``right_edge``, and the root exposes ``subtree``.

    Returns:
        list[dict]: one ``{"Type": ..., "Text": ...}`` entry per clause,
        the independent clause first, followed by the subordinate clauses
        in document order.
    """
    # Dependency labels that mark a subordinate clause, mapped to a
    # human-readable clause type.
    clause_types = {
        "ccomp": "Complement Clause",
        "xcomp": "Open Complement Clause",
        "advcl": "Adverbial Clause",
        "relcl": "Adjective Clause",
    }

    clauses = []

    # First identify all subordinate clauses and their spans.
    subordinate_spans = []
    for token in doc:
        if token.dep_ in clause_types:
            # left_edge/right_edge delimit the token's whole subtree,
            # i.e. the full text of the subordinate clause.
            span = doc[token.left_edge.i:token.right_edge.i + 1]
            subordinate_spans.append({
                "span": span,
                "type": clause_types[token.dep_],
            })

    # Find the root of the parse (the head of the main clause).
    root = next((token for token in doc if token.dep_ == "ROOT"), None)

    if root:
        # The main clause is the root's subtree minus every token that
        # belongs to a subordinate clause.
        main_clause_tokens = set(root.subtree)
        for sub_clause in subordinate_spans:
            for token in sub_clause["span"]:
                main_clause_tokens.discard(token)

        # Reassemble the main clause in document order. Sorting the tokens
        # themselves by their index is robust to repeated words — the old
        # text-keyed sort (`[t.i for t in doc if t.text == x][0]`) gave every
        # duplicate word the first occurrence's index, scrambling the order,
        # and rescanned the whole doc per word (O(n^2)).
        ordered = sorted(main_clause_tokens, key=lambda t: t.i)
        main_clause_text = " ".join(t.text for t in ordered)
        # Drop stray punctuation left behind by the token-level join.
        main_clause_text = main_clause_text.strip().replace(",", "").replace(".", "")
        clauses.append({"Type": "Independent Clause", "Text": main_clause_text})

    # Append the subordinate clauses after the main clause.
    for sub_clause in subordinate_spans:
        clauses.append({
            "Type": sub_clause["type"],
            "Text": sub_clause["span"].text,
        })

    return clauses
76
+
77
def analyze_clause_functions(doc):
    """
    Describe the grammatical function of each clause-marking token.

    Returns a list of ``{"Type", "Function"}`` dicts — one per token whose
    dependency label marks a clause — in document order.
    """
    # Dependency label -> (clause type, grammatical function).
    role_table = {
        "ROOT": ("Independent Clause", "Express the primary action or state"),
        "ccomp": ("Complement Clause", "Acts as object of the main verb"),
        "xcomp": ("Open Complement Clause", "Predicate complement without its own subject"),
        "advcl": ("Adverbial Clause", "Modifies the verb like an adverb"),
        "relcl": ("Adjective Clause", "Modifies a noun like an adjective"),
    }

    functions = []
    for token in doc:
        role = role_table.get(token.dep_)
        if role is not None:
            clause_type, description = role
            functions.append({"Type": clause_type, "Function": description})

    return functions
96
+
97
def create_dependency_graph(doc):
    """
    Build a graphviz Digraph of the sentence's dependency tree.

    Returns None when the Graphviz binaries are not installed, so callers
    can degrade gracefully instead of failing at render time.
    """
    if not check_graphviz_installation():
        return None

    tree = graphviz.Digraph(comment='Dependency Tree')

    # One node per token, labelled "word\n(POS)", keyed by token index.
    for token in doc:
        tree.node(str(token.i), f"{token.text}\n({token.pos_})")

    # One edge per head->child dependency; the root is its own head and
    # therefore gets no incoming edge.
    for token in doc:
        if token.head is not token:
            tree.edge(str(token.head.i), str(token.i), token.dep_)

    return tree
116
 
117
def get_graph_download_link(dot):
    """
    Render the graph to PDF and wrap it in an HTML download link.

    Returns the anchor-tag markup on success, or a plain error string
    if rendering fails (e.g. Graphviz binaries missing).
    """
    try:
        # Render the graph to PDF bytes in memory.
        pdf_bytes = dot.pipe(format='pdf')
    except Exception as exc:
        return f"Error generating download link: {str(exc)}"

    # Embed the PDF as a base64 data URI so no file is written to disk.
    encoded = base64.b64encode(pdf_bytes).decode()
    return f'<a href="data:application/pdf;base64,{encoded}" download="syntax_tree.pdf">Download Syntax Tree (PDF)</a>'
132
+
133
def main():
    """Streamlit entry point: analyze an English sentence's clauses,
    their grammatical functions, and its dependency syntax tree."""
    # Set page to wide mode for better visualization
    st.set_page_config(layout="wide")
    st.markdown("<h1 style='text-align: center; color: white;'>English Clause Analyzer</h1>", unsafe_allow_html=True)
    st.write("Enter an English sentence to analyze its clauses, their functions, and syntax tree.")

    # Input text
    text = st.text_area("Enter your sentence:", "When I arrived at the station, the train had already left.", height=100)

    if st.button("Analyze"):
        if not text:
            # Previously an empty input was silently ignored; tell the user why
            # nothing happened.
            st.warning("Please enter a sentence to analyze.")
            return

        # Process the text with the module-level spaCy pipeline.
        doc = nlp(text)

        # Two-column layout: clause tables left, syntax tree right.
        col1, col2 = st.columns(2)

        with col1:
            # Identify clauses
            clauses = identify_clauses(doc)
            # (was an f-string with no placeholders)
            st.subheader("Clauses Analysis")

            # Convert clauses to DataFrame for better presentation
            df_clauses = pd.DataFrame(clauses)
            st.table(df_clauses.style.set_properties(**{
                'background-color': 'rgba(0,0,0,0.1)',
                'color': 'white'
            }))

            # Display clause functions
            functions = analyze_clause_functions(doc)
            st.subheader("Clause Functions")
            df_functions = pd.DataFrame(functions)
            st.table(df_functions.style.set_properties(**{
                'background-color': 'rgba(0,0,0,0.1)',
                'color': 'white'
            }))

        with col2:
            # Display dependency visualization
            st.subheader("Syntax Tree Visualization")
            if not check_graphviz_installation():
                st.error("Graphviz is not installed. Please install it using:")
                st.code("sudo apt-get install graphviz")
                st.markdown("After installation, restart the application.")
            else:
                dot = create_dependency_graph(doc)
                st.graphviz_chart(dot)

                # Add download button for the graph
                st.markdown(get_graph_download_link(dot), unsafe_allow_html=True)

        # Display part-of-speech tags in a table
        st.subheader("Part-of-Speech Analysis")
        pos_data = [{"Word": token.text, "Part of Speech": token.pos_,
                     "Description": spacy.explain(token.pos_)} for token in doc]
        df_pos = pd.DataFrame(pos_data)
        st.table(df_pos.style.set_properties(**{
            'background-color': 'rgba(0,0,0,0.1)',
            'color': 'white'
        }))

if __name__ == "__main__":
    main()
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
- streamlit
2
- spacy
3
- pandas
4
- plotly
5
- cairosvg
 
1
+ streamlit
2
+ nltk
3
+ spacy
4
+ matplotlib
5
+ graphviz
utils.py DELETED
@@ -1,133 +0,0 @@
1
- import io
2
- from cairosvg import svg2png
3
- from PIL import Image
4
- # import base64
5
-
6
- def get_entity_explanation(label):
7
- """Return explanation for named entity labels"""
8
- explanations = {
9
- 'PERSON': 'People, including fictional',
10
- 'NORP': 'Nationalities, religious or political groups',
11
- 'FAC': 'Buildings, airports, highways, bridges, etc.',
12
- 'ORG': 'Companies, agencies, institutions, etc.',
13
- 'GPE': 'Countries, cities, states',
14
- 'LOC': 'Non-GPE locations, mountain ranges, water bodies',
15
- 'PRODUCT': 'Objects, vehicles, foods, etc.',
16
- 'EVENT': 'Named hurricanes, battles, wars, sports events, etc.',
17
- 'WORK_OF_ART': 'Titles of books, songs, etc.',
18
- 'DATE': 'Absolute or relative dates or periods',
19
- 'TIME': 'Times smaller than a day',
20
- 'MONEY': 'Monetary values, including unit',
21
- 'QUANTITY': 'Measurements, as of weight or distance'
22
- }
23
- return explanations.get(label, 'Other type of entity')
24
-
25
- def analyze_text(nlp, text):
26
- doc = nlp(text)
27
-
28
- # Basic tokenization and POS analysis
29
- tokens = [{
30
- 'Text': token.text,
31
- 'Lemma': token.lemma_,
32
- 'POS': token.pos_,
33
- 'Tag': token.tag_,
34
- 'Dependency': token.dep_,
35
- 'Shape': token.shape_,
36
- 'Is Alpha': token.is_alpha,
37
- 'Is Stop': token.is_stop
38
- } for token in doc]
39
-
40
- # Named Entity Recognition
41
- entities = [{
42
- 'Text': ent.text,
43
- 'Label': ent.label_,
44
- 'Explanation': get_entity_explanation(ent.label_),
45
- 'Start': ent.start_char,
46
- 'End': ent.end_char
47
- } for ent in doc.ents]
48
-
49
- # Noun Chunks (phrases)
50
- noun_chunks = [{
51
- 'Text': chunk.text,
52
- 'Root Text': chunk.root.text,
53
- 'Root Dep': chunk.root.dep_,
54
- 'Root Head Text': chunk.root.head.text
55
- } for chunk in doc.noun_chunks]
56
-
57
- # Text Statistics
58
- stats = {
59
- 'Word Count': len([token for token in doc if not token.is_punct]),
60
- 'Sentence Count': len(list(doc.sents)),
61
- 'Average Words per Sentence': round(len([token for token in doc if not token.is_punct]) / len(list(doc.sents)), 2),
62
- 'Unique Words': len(set([token.text.lower() for token in doc if token.is_alpha])),
63
- 'Stop Words %': round(len([token for token in doc if token.is_stop]) / len(doc) * 100, 2)
64
- }
65
-
66
- return tokens, entities, noun_chunks, stats, doc
67
-
68
- def svg_to_png(svg_content, background_color='white'):
69
- """Convert SVG to PNG with specified background color"""
70
- # Split multiple SVGs if present
71
- svg_parts = svg_content.split('<br><br>')
72
- images = []
73
-
74
- for svg in svg_parts:
75
- # Add SVG namespace if missing
76
- if not 'xmlns="http://www.w3.org/2000/svg"' in svg:
77
- svg = svg.replace('<svg', '<svg xmlns="http://www.w3.org/2000/svg"')
78
-
79
- try:
80
- # Convert SVG to PNG bytes
81
- png_bytes = svg2png(bytestring=svg.encode('utf-8'),
82
- background_color=background_color,
83
- scale=1)
84
-
85
- # Create PIL Image from PNG bytes
86
- img = Image.open(io.BytesIO(png_bytes))
87
-
88
- # Convert RGBA to RGB with white background
89
- if img.mode == 'RGBA':
90
- background = Image.new('RGB', img.size, background_color)
91
- background.paste(img, mask=img.split()[3]) # Use alpha channel as mask
92
- img = background
93
-
94
- # Add some padding
95
- padding = 20 # pixels
96
- img_with_padding = Image.new('RGB',
97
- (img.width, img.height + padding * 2),
98
- background_color)
99
- img_with_padding.paste(img, (0, padding))
100
- images.append(img_with_padding)
101
-
102
- except Exception as e:
103
- st.error(f"Error converting SVG to PNG: {str(e)}")
104
- continue
105
-
106
- if not images:
107
- return None
108
-
109
- # Combine images vertically if there are multiple
110
- if len(images) > 1:
111
- # Calculate total height and max width
112
- total_height = sum(img.height for img in images)
113
- max_width = max(img.width for img in images)
114
-
115
- # Create new image to hold all sentences
116
- combined = Image.new('RGB', (max_width, total_height), background_color)
117
-
118
- # Paste each image
119
- y_offset = 0
120
- for img in images:
121
- # Center image horizontally
122
- x_offset = (max_width - img.width) // 2
123
- combined.paste(img, (x_offset, y_offset))
124
- y_offset += img.height
125
- else:
126
- combined = images[0]
127
-
128
- # Convert to bytes for Streamlit
129
- img_byte_arr = io.BytesIO()
130
- combined.save(img_byte_arr, format='PNG')
131
- img_byte_arr.seek(0)
132
-
133
- return img_byte_arr.getvalue()