Spaces:

AITestingWorkSpace
/

DocumentAnalysis

Paused

App Files Files Community

vishalsh13 committed on Mar 11

Commit

9cb0411

1 Parent(s): f7ab2c4

changed code

Browse files

Files changed (3) hide show

Dockerfile +27 -13
app.py +202 -94
requirements.txt +7 -20

Dockerfile CHANGED Viewed

@@ -1,18 +1,32 @@
-FROM python:3.10-slim
 WORKDIR /app
-COPY requirements.txt /app/
-RUN pip install --no-cache-dir -r requirements.txt
-COPY . /app/
-# Create both /app/uploads and /app/.cache, and set permissions
-# Try chown first; if it fails, fall back to chmod (less secure)
-RUN mkdir -p /app/uploads /app/.cache && \
-    chown -R 1000:1000 /app/uploads /app/.cache || \
-    chmod -R 777 /app/uploads /app/.cache
-ENV HF_HOME=/app/.cache
-EXPOSE 7860
-# Run app.py when the container launches
-CMD ["python", "app.py"]

+FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
+# Set environment variables
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV DEBIAN_FRONTEND=noninteractive
+# Keep the Hugging Face model cache inside the app directory (as the previous
+# image did) so downloads land in a location prepared for the runtime user.
+ENV HF_HOME=/app/.cache
+# Gradio binds to 127.0.0.1 by default; a Docker Space must listen on all
+# interfaces on the exposed port for the platform proxy to reach the app.
+ENV GRADIO_SERVER_NAME=0.0.0.0
+ENV GRADIO_SERVER_PORT=7860
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3 \
+    python3-pip \
+    python3-dev \
+    build-essential \
+    git \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+# Set the working directory
 WORKDIR /app
+# Copy requirements file
+COPY requirements.txt .
+# Install Python dependencies
+RUN pip3 install --no-cache-dir -U pip setuptools wheel
+RUN pip3 install --no-cache-dir -r requirements.txt
+# Copy application code
+COPY app.py .
+# Create the cache directory and make it writable for UID 1000.
+# Try chown first; if it fails, fall back to chmod (less secure).
+RUN mkdir -p /app/.cache && \
+    (chown -R 1000:1000 /app/.cache || chmod -R 777 /app/.cache)
+EXPOSE 7860
+# Set the default command to run the application
+CMD ["python3", "app.py"]

app.py CHANGED Viewed

@@ -7,9 +7,11 @@ from io import BytesIO, IOBase
 import tempfile
 import re
 import datetime
-import gradio as gr
-from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 ###############################################################################
 # 1) Logging Configuration
@@ -21,127 +23,234 @@ logging.basicConfig(
 logger = logging.getLogger("LLM-Legal-App")
 ###############################################################################
-# 2) Retrieve API Key (Hugging Face)
 ###############################################################################
-# Use os.environ to get the API key
-api_key = os.environ.get("HUGGINGFACE_API_KEY")
-if not api_key:
-    logger.error("Hugging Face API key not found in environment variables.")
-    raise ValueError("Hugging Face API key not found.  Set it with `os.environ['HUGGINGFACE_API_KEY'] = 'your_api_key'`")
-logger.info("Successfully retrieved Hugging Face API key.")
 ###############################################################################
-# 3) Hugging Face Model and Utility Functions
 ###############################################################################
-# Initialize the Hugging Face model and tokenizer.
-model_name = "Daemontatox/DocumentCogito"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSeq2SeqLM.from_pretrained(model_name, token=api_key)
-generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device_map="auto")  # Use device_map="auto"
 def generate_legal_document(doc_type, party_a, party_b, context, country):
     """
-    Uses the Hugging Face model to generate a legal document.
     """
     logger.info(f"Starting generation for doc_type={doc_type!r}.")
     party_a = party_a if party_a else "[Party A Not Provided]"
     party_b = party_b if party_b else "[Party B Not Provided]"
     context = context if context else "[Context Not Provided]"
     prompt = f"""
-Generate a {doc_type} for:
-1) {party_a}
-2) {party_b}
-Context/brief of the agreement:
-{context}.
-The document should include:
-- Purpose of the {doc_type}
-- Responsibilities and obligations of each party
-- Confidentiality terms
-- Payment terms (use [To Be Determined] if not specified)
-- Term (duration) and termination
-- Governing law: {country}
-- Jurisdiction: [Appropriate region in {country} if not provided]
-- Signature blocks
-Use formal language, but keep it relatively clear and readable.
-For any missing information, use placeholders like [To Be Determined].
-Include a disclaimer that this is a draft and not legally binding until reviewed and signed.
-"""
     logger.debug(f"Generated prompt:\n{prompt}")
-    try:
-        # Use the Hugging Face pipeline
-        generated_text = generator(
-            prompt,
-            max_length=1400,  # Adjust as needed
-            num_return_sequences=1,
-            temperature=0.3,  # Adjust as needed
-        )[0]['generated_text']
-        logger.info("Document generation complete.")
-        return generated_text
-    except Exception as e:
-        logger.exception("Error generating legal document.")
-        return f"Error generating document: {e}"
 def review_legal_document(doc_text, doc_type, party_a, party_b):
-    """Reviews document using the Hugging Face model."""
     logger.info("Starting document review (rule-based and wording).")
     # --- Rule-Based Review ---
     rule_based_prompt = f"""
-Review the following document and provide feedback based on these rules:
 Document text:
 \"\"\"
 {doc_text}
 \"\"\"
-1) Parties and Authority: ... (rest of prompt from previous turns) ...
 """
     logger.debug(f"Generated rule-based review prompt:\n{rule_based_prompt}")
     try:
-        rule_based_review = generator(
-            rule_based_prompt,
-            max_length=2000,
-            num_return_sequences=1,
-            temperature=0.3,
-        )[0]['generated_text']
     except Exception as e:
         logger.exception("Error during rule-based review.")
         return f"Error during rule-based review: {e}"
     # --- Wording Analysis ---
     wording_analysis_prompt = f"""
-Analyze the wording of the following legal document:
 Document text:
 \"\"\"
 {doc_text}
 \"\"\"
-Provide an analysis covering: ... (rest of prompt from previous turns) ...
 """
     logger.debug(f"Generated wording analysis prompt:\n{wording_analysis_prompt}")
     try:
-        wording_analysis = generator(
-            wording_analysis_prompt,
-            max_length=1400,
-            num_return_sequences=1,
-            temperature=0.3,
-        )[0]['generated_text']
     except Exception as e:
         logger.exception("Error during wording analysis.")
         return f"Error during wording analysis: {e}"
@@ -149,15 +258,13 @@ Provide an analysis covering: ... (rest of prompt from previous turns) ...
     combined_review = f"Rule-Based Analysis:\n\n{rule_based_review}\n\nWording Analysis:\n\n{wording_analysis}"
     return combined_review
 ###############################################################################
 # 4) File Parsing (PDF, DOCX)
 ###############################################################################
 def parse_bytesio(file_data: BytesIO) -> str:
-    """Parses a BytesIO object."""
     logger.info("Parsing BytesIO object...")
-    # ... (rest of parse_bytesio function from previous turns) ...
     try:
         # Attempt to determine file type from content
         try:
@@ -181,7 +288,6 @@ def parse_bytesio(file_data: BytesIO) -> str:
 def parse_uploaded_file_path(file_data) -> str:
     """Takes file data, determines type, extracts text."""
-   # ... (rest of parse_uploaded_file_path from previous turns)
     if not file_data:
         logger.warning("No file provided.")
         return ""
@@ -222,7 +328,6 @@ def parse_uploaded_file_path(file_data) -> str:
 def clean_markdown(text):
     """Removes common Markdown formatting."""
-    # ... (rest of clean_markdown from previous turns)
     if not text: return ""
     text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
     text = re.sub(r'(\*\*|__)(.*?)(\*\*|__)', r'\2', text)
@@ -237,14 +342,13 @@ def clean_markdown(text):
 def create_and_save_docx(doc_text, review_text=None, doc_type="Unknown", party_a="Party A", party_b="Party B"):
     """Creates DOCX, adds review, saves to temp file, returns path."""
     logger.debug("Creating and saving DOCX.")
-    # ... (rest of create_and_save_docx from previous turns) ...
     document = docx.Document()
     now = datetime.datetime.now()
     timestamp = now.strftime("%Y%m%d_%H%M%S")
-    file_name = f"GEN_AI_Review_{doc_type}_{timestamp}.docx"
-    title = f"Gen AI Analysis of {doc_type} between companies {party_a} and {party_b}"
     document.add_heading(title, level=1)
     if doc_text:
@@ -277,7 +381,6 @@ def create_and_save_docx(doc_text, review_text=None, doc_type="Unknown", party_a
             else:  # Other sections (if any)
                 document.add_paragraph(section)
     with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file_name}") as tmpfile:
         document.save(tmpfile.name)
         logger.debug(f"DOCX saved to: {tmpfile.name}")
@@ -292,7 +395,7 @@ def generate_document_interface(doc_type, party_a, party_b, context, country):
     logger.info(f"User requested doc generation: {doc_type}, {country}")
     doc_text = generate_legal_document(doc_type, party_a, party_b, context, country)
     if doc_text.startswith("Error"):
-        return doc_text, None
     docx_file_path = create_and_save_docx(doc_text, doc_type=doc_type, party_a=party_a, party_b=party_b)
     return doc_text, docx_file_path
@@ -316,6 +419,7 @@ def review_document_interface(file_data, doc_type, party_a, party_b):
 ###############################################################################
 # 7) Build & Launch Gradio App
 ###############################################################################
 custom_css = """
 .tab-one {
     background-color: #D1EEFC; /* Light blue */
@@ -325,13 +429,15 @@ custom_css = """
     background-color: #FCEED1; /* Light orange */
     color: #333;
 }
 """
 def build_app():
     with gr.Blocks(css=custom_css) as demo:
         gr.Markdown(
             """
-            # UST Global LLM-based Legal Reviewer
             **Review an Existing MOU, SOW, MSA in PDF/DOCX format**: Upload a document for analysis.
@@ -339,7 +445,7 @@ def build_app():
             """
         )
         with gr.Tabs(selected=1):
-          with gr.Tab("Generate Document",visible=False):
               doc_type = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU")
               party_a = gr.Textbox(label="Party A Name", placeholder="e.g., Tech Innovations LLC")
               party_b = gr.Textbox(label="Party B Name", placeholder="e.g., Global Consulting Corp")
@@ -354,7 +460,7 @@ def build_app():
                   outputs=[gen_output_text, gen_output_file]
               )
-          with gr.Tab("Review Document",elem_classes="tab-one", id=1):
               # Hidden inputs to store values from Generate tab
               doc_type_review = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU", visible=False)
               party_a_review = gr.Textbox(label="Party A Name", visible=False)
@@ -372,11 +478,13 @@ def build_app():
               # Copy values from Generate to Review tab (hidden fields)
               gen_button.click(lambda x, y, z: (x, y, z), [doc_type, party_a, party_b], [doc_type_review, party_a_review, party_b_review])
-        gr.Markdown("**Note:** Scanned PDFs may not parse correctly.  .docx is generally preferred.")
     return demo
-logger.info("Initializing Gradio interface...")
-demo = build_app()
-logger.info("Launching Gradio app.")
-demo.launch(debug=True)

 import tempfile
 import re
 import datetime
+import torch
+import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import huggingface_hub
 ###############################################################################
 # 1) Logging Configuration
 logger = logging.getLogger("LLM-Legal-App")
 ###############################################################################
+# 2) Initialize Hugging Face Model
 ###############################################################################
+def initialize_model():
+    """Load the DocumentCogito causal LM and its tokenizer from the Hugging Face Hub.
+
+    Returns:
+        tuple: ``(model, tokenizer)``.
+
+    Raises:
+        ValueError: If the tokenizer or the model cannot be loaded. The
+            original exception is attached as ``__cause__``.
+    """
+    logger.info("Initializing DocumentCogito model and tokenizer...")
+    try:
+        # A gated or private checkpoint would additionally need an access
+        # token, e.g. token=huggingface_hub.get_token().
+        model_name = "Daemontatox/DocumentCogito"
+        # Half precision is only worthwhile on a GPU; without CUDA fall back
+        # to float32 so CPU-only hosts do not run (or fail) in float16.
+        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=dtype,
+            device_map="auto",
+            # SECURITY NOTE(review): this executes Python code shipped in the
+            # model repository. Confirm the repo is trusted or pin a revision.
+            trust_remote_code=True,
+        )
+        logger.info("Successfully initialized DocumentCogito model and tokenizer.")
+        return model, tokenizer
+    except Exception as e:
+        logger.exception("Error initializing Hugging Face model.")
+        # Same exception type as before; "from e" keeps the root cause explicit.
+        raise ValueError(f"Failed to initialize model: {e}") from e
+# Initialize model and tokenizer once, at import time. generate_with_model()
+# reads these two module-level globals; a load failure raises ValueError here.
+model, tokenizer = initialize_model()
 ###############################################################################
+# 3) LLM Utility Functions (Generation & Review)
 ###############################################################################
+def generate_with_model(prompt, max_length=1400, temperature=0.3):
+    """Generate a completion for ``prompt`` with the module-level model.
+
+    Args:
+        prompt: Full prompt text fed to the model.
+        max_length: Maximum number of NEW tokens to generate. It is passed
+            to ``generate`` as ``max_new_tokens``; the parameter name is kept
+            for backward compatibility.
+        temperature: Sampling temperature. A value of 0 or less switches to
+            greedy decoding.
+
+    Returns:
+        str: The generated text without the prompt. On any failure, a string
+        starting with "Error generating text:" is returned instead of
+        raising, so callers must check for that prefix.
+    """
+    logger.info("Generating text with DocumentCogito model.")
+    try:
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+        prompt_token_count = inputs["input_ids"].shape[-1]
+        do_sample = temperature > 0
+        generation_config = {
+            "max_new_tokens": max_length,
+            "do_sample": do_sample,
+            "pad_token_id": tokenizer.eos_token_id,
+        }
+        # Sampling knobs are only meaningful (and only accepted without
+        # complaints) when sampling is actually enabled.
+        if do_sample:
+            generation_config["temperature"] = temperature
+            generation_config["top_p"] = 0.9
+        with torch.no_grad():
+            outputs = model.generate(**inputs, **generation_config)
+        # A causal LM returns prompt tokens followed by the new tokens. Drop
+        # the prompt by token count; comparing decoded text against the
+        # prompt is unreliable because decoding may not round-trip exactly.
+        new_tokens = outputs[0][prompt_token_count:]
+        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
+        logger.info("Text generation complete.")
+        return response
+    except Exception as e:
+        logger.exception("Error during text generation.")
+        return f"Error generating text: {e}"
 def generate_legal_document(doc_type, party_a, party_b, context, country):
     """
+    Uses DocumentCogito to generate a legal document. Returns the document text.
+
+    Args:
+        doc_type: Kind of document to draft; interpolated into the prompt as-is.
+        party_a: Name of the first party; a placeholder is used when empty.
+        party_b: Name of the second party; a placeholder is used when empty.
+        context: Brief of the agreement; a placeholder is used when empty.
+        country: Governing-law country; a placeholder is used when empty.
+
+    Returns:
+        Whatever generate_with_model() returns for the assembled prompt
+        (the generated text, or its "Error ..." string on failure).
     """
     logger.info(f"Starting generation for doc_type={doc_type!r}.")
+    # Fill placeholders if fields are missing
     party_a = party_a if party_a else "[Party A Not Provided]"
     party_b = party_b if party_b else "[Party B Not Provided]"
     context = context if context else "[Context Not Provided]"
+    # Without this, an empty country renders "Governing law:" with no value.
+    country = country if country else "[Country Not Provided]"
     prompt = f"""
+    You are a helpful legal assistant. Generate a {doc_type} for:
+    1) {party_a}
+    2) {party_b}
+    Context/brief of the agreement:
+    {context}.
+    The document should include:
+    - Purpose of the {doc_type}
+    - Responsibilities and obligations of each party
+    - Confidentiality terms
+    - Payment terms (use [To Be Determined] if not specified)
+    - Term (duration) and termination
+    - Governing law: {country}
+    - Jurisdiction: [Appropriate region in {country} if not provided]
+    - Signature blocks
+    Use formal language, but keep it relatively clear and readable.
+    For any missing information, use placeholders like [To Be Determined].
+    Include a disclaimer that this is a draft and not legally binding until reviewed and signed.
+    """
     logger.debug(f"Generated prompt:\n{prompt}")
+    return generate_with_model(prompt, max_length=1400, temperature=0.3)
 def review_legal_document(doc_text, doc_type, party_a, party_b):
+    """
+    Reviews document: first with rule-based checks, then wording analysis.
+
+    Args:
+        doc_text: Full text of the document to review; embedded in both prompts.
+        doc_type: Not used by this function body; kept for the existing call signature.
+        party_a: Not used by this function body; kept for the existing call signature.
+        party_b: Not used by this function body; kept for the existing call signature.
+
+    Returns:
+        The combined "Rule-Based Analysis" / "Wording Analysis" text, or a
+        string starting with "Error during ..." if either generation fails.
+    """
     logger.info("Starting document review (rule-based and wording).")
     # --- Rule-Based Review ---
     rule_based_prompt = f"""
+You are a legal AI assistant reviewing a document. Provide a review,
+structured into the following numbered sections. Be concise and factual. Do NOT
+use Markdown. Use plain text labels for each section.
 Document text:
 \"\"\"
 {doc_text}
 \"\"\"
+Review Sections:
+1) Parties and Authority:
+    - Confirm the full legal names of all parties.
+    - Make sure the people signing can legally commit their organizations.
+2) Scope of Work / Obligations:
+    - Check that the contract clearly describes what each side must do.
+    - Look for deadlines, milestones, or deliverables.
+    - Ensure everything is realistic and not overly vague.
+3) Definitions and Key Terms:
+    - See if there's a section that explains important terms.
+    - Ensure those terms are used the same way throughout the contract.
+    - Avoid or clarify any ambiguous language.
+4) Payment Terms (If Applicable):
+    - Check how much is owed, the currency, and when it's due.
+    - Look for penalties, interest, or late fees.
+    - Note how and when invoices are sent or paid.
+5) Term and Termination:
+    - Identify when the contract starts and ends.
+    - Understand how it can be renewed.
+    - See the conditions and notice required for ending the contract early.
+6) Intellectual Property (IP) Rights:
+    - Confirm who owns any work created under the agreement.
+    - Note if licenses are granted for using the IP, and for how long.
+7) Confidentiality and Privacy:
+    - Check what is considered confidential information.
+    - Look for exceptions (like already public info).
+    - See how long the confidentiality rules apply.
+8) Warranties and Representations:
+    - Note any performance guarantees or quality promises.
+    - Look for disclaimers (like "as is" clauses).
+9) Indemnification:
+    - See who will pay legal costs or damages if there's a lawsuit or claim.
+    - Check any limits on what's covered.
+10) Limitation of Liability:
+    - Check if there's a maximum amount one side can claim in damages.
+    - Look for excluded damages, like lost profits.
+11) Dispute Resolution and Governing Law:
+    - See if disputes go to arbitration, mediation, or court.
+    - Note which state or country's laws will apply.
+12) Force Majeure (Unforeseen Events):
+    - Look for events like natural disasters or war that could suspend obligations.
+    - See if there are notice requirements for these events.
+13) Notices and Amendments:
+    - Check how official notices must be sent (email, mail, etc.).
+    - Find out how to properly change the contract (in writing, signatures, etc.).
+14) Entire Agreement and Severability:
+    - Confirm that this contract replaces all previous agreements.
+    - Ensure that if one clause is invalid, the rest still stands.
+15) Signatures and Dates:
+    - Make sure the right people sign in their proper roles.
+    - Verify the date of signature and when the contract goes into effect.
+16) Ambiguities, Contradictions, and Hidden Clauses:
+    - Watch for contradictory statements or clauses that conflict.
+    - Beware of vague phrases like "best efforts" without clear guidelines.
+    - Check for hidden or "buried" clauses in fine print or attachments.
+17) Compliance and Regulatory Alignment:
+    - Ensure the contract follows relevant laws and rules.
+    - Check for industry-specific requirements.
+18) Practical Considerations:
+    - Make sure deadlines and other requirements are doable.
+    - Confirm all negotiations are reflected in writing.
+    - Avoid blank or undefined items (like fees or dates "to be decided").
 """
     logger.debug(f"Generated rule-based review prompt:\n{rule_based_prompt}")
     try:
+        rule_based_review = generate_with_model(rule_based_prompt, max_length=2000, temperature=0.3)
     except Exception as e:
         logger.exception("Error during rule-based review.")
         return f"Error during rule-based review: {e}"
+    # generate_with_model() catches its own exceptions and reports failure by
+    # returning an "Error generating text:" string, so the except above never
+    # sees it. Check explicitly, and stop before the second generation.
+    if rule_based_review.startswith("Error generating text:"):
+        logger.error("Rule-based review failed inside generate_with_model.")
+        return f"Error during rule-based review: {rule_based_review}"
     # --- Wording Analysis ---
     wording_analysis_prompt = f"""
+You are a legal AI assistant. Analyze the following legal document for its wording:
 Document text:
 \"\"\"
 {doc_text}
 \"\"\"
+Provide a comprehensive analysis of the document's wording, covering these aspects for the ENTIRE document text:
+1. **Clarity and Precision:** Identify ambiguous or vague language, and suggest improvements.
+2. **Readability:** Assess the overall readability and suggest improvements for clarity, including sentence structure and complexity.
+3. **Formal Tone:** Check if the language maintains a formal and professional tone appropriate for a legal document, and suggest changes if needed.
+4. **Consistency:** Ensure consistent use of terms and phrasing throughout the document. Point out any inconsistencies.
+5. **Redundancy:** Identify any unnecessary repetition of words or phrases.
+6. **Jargon and Technical Terms:** Identify jargon or technical terms that might be unclear to a non-expert, and suggest clearer alternatives where appropriate.
+7. **Overall Recommendations:** Give overall recommendations for improving the document's wording.
+Provide your analysis in plain text, without using Markdown. Label each section of your analysis clearly (e.g., "Clarity and Precision:", "Readability:", etc.).
 """
     logger.debug(f"Generated wording analysis prompt:\n{wording_analysis_prompt}")
     try:
+        wording_analysis = generate_with_model(wording_analysis_prompt, max_length=1000, temperature=0.3)
     except Exception as e:
         logger.exception("Error during wording analysis.")
         return f"Error during wording analysis: {e}"
+    # Same failure contract as above: an error string, not an exception.
+    if wording_analysis.startswith("Error generating text:"):
+        logger.error("Wording analysis failed inside generate_with_model.")
+        return f"Error during wording analysis: {wording_analysis}"
     combined_review = f"Rule-Based Analysis:\n\n{rule_based_review}\n\nWording Analysis:\n\n{wording_analysis}"
     return combined_review
 ###############################################################################
 # 4) File Parsing (PDF, DOCX)
 ###############################################################################
 def parse_bytesio(file_data: BytesIO) -> str:
+    """Parses a BytesIO object representing a PDF or DOCX."""
     logger.info("Parsing BytesIO object...")
     try:
         # Attempt to determine file type from content
         try:
 def parse_uploaded_file_path(file_data) -> str:
     """Takes file data, determines type, extracts text."""
     if not file_data:
         logger.warning("No file provided.")
         return ""
 def clean_markdown(text):
     """Removes common Markdown formatting."""
     if not text: return ""
     text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
     text = re.sub(r'(\*\*|__)(.*?)(\*\*|__)', r'\2', text)
 def create_and_save_docx(doc_text, review_text=None, doc_type="Unknown", party_a="Party A", party_b="Party B"):
     """Creates DOCX, adds review, saves to temp file, returns path."""
     logger.debug("Creating and saving DOCX.")
     document = docx.Document()
     now = datetime.datetime.now()
     timestamp = now.strftime("%Y%m%d_%H%M%S")
+    file_name = f"HF_AI_Review_{doc_type}_{timestamp}.docx"
+    title = f"DocumentCogito Analysis of {doc_type} between companies {party_a} and {party_b}"
     document.add_heading(title, level=1)
     if doc_text:
             else:  # Other sections (if any)
                 document.add_paragraph(section)
     with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file_name}") as tmpfile:
         document.save(tmpfile.name)
         logger.debug(f"DOCX saved to: {tmpfile.name}")
     logger.info(f"User requested doc generation: {doc_type}, {country}")
     doc_text = generate_legal_document(doc_type, party_a, party_b, context, country)
     if doc_text.startswith("Error"):
+      return doc_text, None
     docx_file_path = create_and_save_docx(doc_text, doc_type=doc_type, party_a=party_a, party_b=party_b)
     return doc_text, docx_file_path
 ###############################################################################
 # 7) Build & Launch Gradio App
 ###############################################################################
+# Define custom CSS in a string.
 custom_css = """
 .tab-one {
     background-color: #D1EEFC; /* Light blue */
     background-color: #FCEED1; /* Light orange */
     color: #333;
 }
+/* If you want to style the tab label differently, you may need to target
+   specific child elements (like a .tab__header) within the class. */
 """
 def build_app():
     with gr.Blocks(css=custom_css) as demo:
         gr.Markdown(
             """
+            # UST Global Legal Document Analyzer (Hugging Face Version)
             **Review an Existing MOU, SOW, MSA in PDF/DOCX format**: Upload a document for analysis.
             """
         )
         with gr.Tabs(selected=1):
+          with gr.Tab("Generate Document", visible=False):
               doc_type = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU")
               party_a = gr.Textbox(label="Party A Name", placeholder="e.g., Tech Innovations LLC")
               party_b = gr.Textbox(label="Party B Name", placeholder="e.g., Global Consulting Corp")
                   outputs=[gen_output_text, gen_output_file]
               )
+          with gr.Tab("Review Document", elem_classes="tab-one", id=1):
               # Hidden inputs to store values from Generate tab
               doc_type_review = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU", visible=False)
               party_a_review = gr.Textbox(label="Party A Name", visible=False)
               # Copy values from Generate to Review tab (hidden fields)
               gen_button.click(lambda x, y, z: (x, y, z), [doc_type, party_a, party_b], [doc_type_review, party_a_review, party_b_review])
+          gr.Markdown("**Note:** Scanned PDFs may not parse correctly. .docx is generally preferred.")
     return demo
+# For Hugging Face Spaces deployment: runs only when app.py is executed as a
+# script (e.g. `python3 app.py`), not when the module is imported.
+if __name__ == "__main__":
+#    create_requirements_file()
+    # Build the Blocks UI, then start the Gradio server with default launch settings.
+    logger.info("Initializing Gradio interface...")
+    demo = build_app()
+    logger.info("Launching Gradio app.")
+    demo.launch()

requirements.txt CHANGED Viewed

@@ -1,20 +1,7 @@
-# Core Libraries
-gradio>=4.0,<5.0   # UI framework (tested with 4.14.0, but should work with 4.x)
-python-docx     # For creating DOCX files (python-docx)
-PyPDF2>=3.0.0     # For reading PDF files
-transformers>=4.36.0,<4.37  # Hugging Face Transformers (version range for compatibility)
-sentencepiece>=0.1.9,<0.2.0  # Required by some Hugging Face tokenizers
-torch>=2.1.0    # PyTorch (at least 2.1 for recent transformers compatibility)
-# Other commonly used packages (you might already have these)
-#  - If you encounter installation issues, you can try removing or commenting out
-#    lines for packages you believe are already installed correctly in your
-#    environment.  However, it's generally good practice to include them
-#    for reproducibility.
-typing-extensions>=4.0.0  # For type hints and compatibility (often a dependency)
-requests>=2.0.0            # Used by transformers and other libraries
-filelock>=3.0.0           # Used by transformers for managing cache
-packaging>=20.0           # For version handling
-regex!=2019.12.17         # Used for text processing
-tqdm>=4.27                # For progress bars (used by transformers)
-numpy>=1.17               # Fundamental numerical computing

+# UI framework (app.py: `import gradio as gr`)
+gradio>=3.50.2
+# AutoModelForCausalLM / AutoTokenizer used to load and run the model
+transformers>=4.35.0
+# Tensor backend for the model (torch.float16, torch.no_grad)
+torch>=2.0.0
+# DOCX creation (docx.Document)
+python-docx>=0.8.11
+# PDF reading for uploaded documents
+PyPDF2>=3.0.0
+# Imported by app.py (Hub access / tokens)
+huggingface_hub>=0.19.0
+# Required by transformers when loading with device_map="auto"
+accelerate>=0.20.0