Spaces:

walaa2022
/

financial-analysis-system

Sleeping

App Files Community

walaa2022 committed on Nov 26, 2024

Commit

324809c

verified ·

1 Parent(s): 91033f9

Update app.py

Browse files

Files changed (1) hide show

app.py +173 -87

app.py CHANGED Viewed

@@ -4,9 +4,8 @@ import pandas as pd
 import torch
 import logging
 import gc
-import signal
-from contextlib import contextmanager
-import psutil
 from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
 # Setup logging
@@ -20,33 +19,15 @@ logger = logging.getLogger(__name__)
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 logger.info(f"Using device: {DEVICE}")
-def monitor_memory():
-    """Monitor system memory usage"""
-    try:
-        process = psutil.Process(os.getpid())
-        memory_info = process.memory_info()
-        logger.info(f"Memory usage: {memory_info.rss / 1024 / 1024:.2f} MB")
-        if DEVICE == "cuda":
-            logger.info(f"GPU Memory: {torch.cuda.max_memory_allocated() / 1024 / 1024:.2f} MB")
-    except Exception as e:
-        logger.error(f"Error monitoring memory: {str(e)}")
 def clear_gpu_memory():
     """Utility function to clear GPU memory"""
     if DEVICE == "cuda":
         torch.cuda.empty_cache()
     gc.collect()
-@contextmanager
-def timeout_context(seconds):
-    def signal_handler(signum, frame):
-        raise TimeoutError(f"Operation timed out after {seconds} seconds")
-    signal.signal(signal.SIGALRM, signal_handler)
-    signal.alarm(seconds)
-    try:
-        yield
-    finally:
-        signal.alarm(0)
 class ModelManager:
     """Handles model loading and inference"""
@@ -59,40 +40,46 @@ class ModelManager:
         self.max_cache_size = 2
     def load_model(self, model_name, model_type="sentiment", timeout=300):
-        """Load model and tokenizer with timeout"""
         try:
             if model_name in self.model_cache:
                 self.models[model_name] = self.model_cache[model_name]
                 logger.info(f"Loaded {model_name} from cache")
                 return
-            with timeout_context(timeout):
-                if model_name not in self.models:
-                    if model_type == "sentiment":
-                        self.tokenizers[model_name] = AutoTokenizer.from_pretrained(
-                            model_name,
-                            use_fast=True
-                        )
-                        self.models[model_name] = AutoModelForSequenceClassification.from_pretrained(
-                            model_name,
-                            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
-                        ).to(self.device)
-                    else:
-                        self.models[model_name] = pipeline(
-                            "text-generation",
-                            model=model_name,
-                            device_map="auto" if self.device == "cuda" else None,
-                            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
-                        )
-                    # Cache the model
-                    self.cache_model(model_name, self.models[model_name])
-                    logger.info(f"Successfully loaded model: {model_name}")
-                    monitor_memory()
         except Exception as e:
             logger.error(f"Error loading model {model_name}: {str(e)}")
-            raise
     def cache_model(self, model_name, model):
         """Cache model for faster reloading"""
@@ -132,9 +119,10 @@ class FinancialAnalyzer:
             "recommendation": "tiiuae/falcon-rw-1b"
         }
-        # Load sentiment model at initialization
         try:
-            self.model_manager.load_model(self.models["sentiment"], "sentiment")
         except Exception as e:
             logger.error(f"Failed to initialize sentiment model: {str(e)}")
             raise
@@ -186,12 +174,17 @@ class FinancialAnalyzer:
             if len(text) == 0:
                 raise ValueError("Empty text input")
             # Tokenize with proper padding and truncation
             inputs = tokenizer(
                 text,
                 return_tensors="pt",
                 truncation=True,
-                max_length=512,
                 padding=True
             ).to(DEVICE)
@@ -217,10 +210,16 @@ class FinancialAnalyzer:
             return [{"label": "error", "score": 1.0}]
     def generate_analysis(self, financial_data):
-        """Generate strategic analysis with improved prompting"""
         try:
             model_name = self.models["analysis"]
-            self.model_manager.load_model(model_name, "generation")
             prompt = f"""[INST] As a senior financial analyst, provide a detailed analysis of these financial statements:
@@ -256,6 +255,7 @@ class FinancialAnalyzer:
             Provide specific metrics and detailed explanations for each section. [/INST]"""
             response = self.model_manager.get_model(model_name)(
                 prompt,
                 max_length=2000,
@@ -283,12 +283,27 @@ class FinancialAnalyzer:
             sections = text.split('\n\n')
             formatted_sections = []
             for section in sections:
-                if section.strip():
-                    if any(section.startswith(str(i)) for i in range(1, 6)):
-                        formatted_sections.append(f"### {section}")
-                    else:
-                        formatted_sections.append(section)
             return '\n\n'.join(formatted_sections)
         except Exception as e:
@@ -296,10 +311,16 @@ class FinancialAnalyzer:
             return text
     def generate_recommendations(self, analysis):
-        """Generate recommendations with comprehensive prompting"""
         try:
             model_name = self.models["recommendation"]
-            self.model_manager.load_model(model_name, "generation")
             prompt = f"""Based on this financial analysis, provide detailed strategic recommendations:
@@ -341,6 +362,7 @@ class FinancialAnalyzer:
             Format each section with clear, actionable bullet points."""
             response = self.model_manager.get_model(model_name)(
                 prompt,
                 max_length=2000,
@@ -368,12 +390,27 @@ class FinancialAnalyzer:
             sections = text.split('\n\n')
             formatted_sections = []
             for section in sections:
-                if section.strip():
-                    if any(section.startswith(str(i)) for i in range(1, 6)):
-                        formatted_sections.append(f"### {section}")
-                    else:
-                        formatted_sections.append(section)
             return '\n\n'.join(formatted_sections)
         except Exception as e:
@@ -383,7 +420,8 @@ class FinancialAnalyzer:
 def analyze_financial_statements(income_statement, balance_sheet):
     """Main analysis function with improved error handling and logging"""
     try:
-        monitor_memory()
         analyzer = FinancialAnalyzer()
         # Validate inputs
@@ -391,8 +429,9 @@ def analyze_financial_statements(income_statement, balance_sheet):
             return "Error: Please provide both income statement and balance sheet files"
         # Process financial statements
-        logger.info("Processing financial statements...")
         income_summary = analyzer.read_csv(income_statement)
         balance_summary = analyzer.read_csv(balance_sheet)
         financial_data = f"""
@@ -404,20 +443,32 @@ def analyze_financial_statements(income_statement, balance_sheet):
         """
         # Generate analysis
-        logger.info("Generating analysis...")
         analysis = analyzer.generate_analysis(financial_data)
         # Analyze sentiment
-        logger.info("Analyzing sentiment...")
         sentiment = analyzer.analyze_sentiment(analysis)
         # Generate recommendations
-        logger.info("Generating recommendations...")
         recommendations = analyzer.generate_recommendations(analysis)
         # Format results
         result = format_results(analysis, sentiment, recommendations)
-        monitor_memory()
         return result
     except Exception as e:
@@ -429,7 +480,9 @@ def analyze_financial_statements(income_statement, balance_sheet):
         Please verify:
         1. Files are valid CSV format
         2. Files contain required financial data
-        3. File size is within limits"""
 def format_results(analysis, sentiment, recommendations):
     """Format analysis results with improved validation and formatting"""
@@ -458,34 +511,67 @@ def format_results(analysis, sentiment, recommendations):
         logger.error(f"Formatting error: {str(e)}")
         return "Error formatting results"
-# Create Gradio interface with improved error handling
 iface = gr.Interface(
     fn=analyze_financial_statements,
     inputs=[
-        gr.File(label="Income Statement (CSV)"),
-        gr.File(label="Balance Sheet (CSV)")
     ],
     outputs=gr.Markdown(),
-    title="Financial Statement Analyzer",
-    description="""Upload financial statements for AI-powered analysis:
-    - Strategic Analysis (TinyLlama)
-    - Sentiment Analysis (FinBERT)
-    - Strategic Recommendations (Falcon)
-    Note:
-    - Files must be in CSV format
-    - Each file should contain financial data in columns
-    - Maximum file size: 10MB""",
     flagging_mode="never"
 )
 if __name__ == "__main__":
     try:
         iface.queue()
         iface.launch(
             share=False,
             server_name="0.0.0.0",
-            server_port=7860
         )
     except Exception as e:
         logger.error(f"Launch error: {str(e)}")

 import torch
 import logging
 import gc
+import threading
+import concurrent.futures
 from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
 # Setup logging
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 logger.info(f"Using device: {DEVICE}")
 def clear_gpu_memory():
     """Utility function to clear GPU memory"""
     if DEVICE == "cuda":
         torch.cuda.empty_cache()
     gc.collect()
+class ModelLoadingError(Exception):
+    """Custom exception for model loading errors"""
+    pass
 class ModelManager:
     """Handles model loading and inference"""
         self.max_cache_size = 2
     def load_model(self, model_name, model_type="sentiment", timeout=300):
+        """Load model and tokenizer with thread-safe timeout"""
         try:
             if model_name in self.model_cache:
                 self.models[model_name] = self.model_cache[model_name]
                 logger.info(f"Loaded {model_name} from cache")
                 return
+            def load_model_task():
+                if model_type == "sentiment":
+                    self.tokenizers[model_name] = AutoTokenizer.from_pretrained(
+                        model_name,
+                        use_fast=True
+                    )
+                    self.models[model_name] = AutoModelForSequenceClassification.from_pretrained(
+                        model_name,
+                        torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
+                    ).to(self.device)
+                else:
+                    self.models[model_name] = pipeline(
+                        "text-generation",
+                        model=model_name,
+                        device_map="auto" if self.device == "cuda" else None,
+                        torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
+                    )
+            # Use ThreadPoolExecutor for timeout
+            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                future = executor.submit(load_model_task)
+                try:
+                    future.result(timeout=timeout)
+                except concurrent.futures.TimeoutError:
+                    raise ModelLoadingError(f"Model loading timed out after {timeout} seconds")
+            # Cache the model
+            self.cache_model(model_name, self.models[model_name])
+            logger.info(f"Successfully loaded model: {model_name}")
         except Exception as e:
             logger.error(f"Error loading model {model_name}: {str(e)}")
+            raise ModelLoadingError(f"Failed to load model {model_name}: {str(e)}")
     def cache_model(self, model_name, model):
         """Cache model for faster reloading"""
             "recommendation": "tiiuae/falcon-rw-1b"
         }
+        # Load sentiment model at initialization with longer timeout
         try:
+            self.model_manager.load_model(self.models["sentiment"], "sentiment", timeout=600)
+            logger.info("Sentiment model initialized successfully")
         except Exception as e:
             logger.error(f"Failed to initialize sentiment model: {str(e)}")
             raise
             if len(text) == 0:
                 raise ValueError("Empty text input")
+            # Truncate text if too long
+            max_length = 512
+            if len(text.split()) > max_length:
+                logger.warning(f"Text length exceeds {max_length} tokens. Truncating...")
             # Tokenize with proper padding and truncation
             inputs = tokenizer(
                 text,
                 return_tensors="pt",
                 truncation=True,
+                max_length=max_length,
                 padding=True
             ).to(DEVICE)
             return [{"label": "error", "score": 1.0}]
     def generate_analysis(self, financial_data):
+        """Generate strategic analysis with improved prompting and error handling"""
         try:
             model_name = self.models["analysis"]
+            self.model_manager.load_model(model_name, "generation", timeout=600)
+            # Truncate financial data if too long
+            max_data_length = 1000
+            if len(financial_data.split()) > max_data_length:
+                logger.warning(f"Financial data too long. Truncating to {max_data_length} tokens...")
+                financial_data = ' '.join(financial_data.split()[:max_data_length])
             prompt = f"""[INST] As a senior financial analyst, provide a detailed analysis of these financial statements:
             Provide specific metrics and detailed explanations for each section. [/INST]"""
+            logger.info("Generating analysis...")
             response = self.model_manager.get_model(model_name)(
                 prompt,
                 max_length=2000,
             sections = text.split('\n\n')
             formatted_sections = []
+            current_section = None
             for section in sections:
+                section = section.strip()
+                if not section:
+                    continue
+                # Check if this is a new section
+                if any(section.startswith(str(i)) for i in range(1, 6)):
+                    current_section = f"### {section}"
+                    formatted_sections.append(current_section)
+                elif current_section:
+                    # Add bullet points to content under sections
+                    lines = section.split('\n')
+                    formatted_lines = []
+                    for line in lines:
+                        line = line.strip()
+                        if line:
+                            if not line.startswith('- '):
+                                line = f"- {line}"
+                            formatted_lines.append(line)
+                    formatted_sections.append('\n'.join(formatted_lines))
             return '\n\n'.join(formatted_sections)
         except Exception as e:
             return text
     def generate_recommendations(self, analysis):
+        """Generate recommendations with improved prompting and error handling"""
         try:
             model_name = self.models["recommendation"]
+            self.model_manager.load_model(model_name, "generation", timeout=600)
+            # Truncate analysis if too long
+            max_analysis_length = 1000
+            if len(analysis.split()) > max_analysis_length:
+                logger.warning(f"Analysis too long. Truncating to {max_analysis_length} tokens...")
+                analysis = ' '.join(analysis.split()[:max_analysis_length])
             prompt = f"""Based on this financial analysis, provide detailed strategic recommendations:
             Format each section with clear, actionable bullet points."""
+            logger.info("Generating recommendations...")
             response = self.model_manager.get_model(model_name)(
                 prompt,
                 max_length=2000,
             sections = text.split('\n\n')
             formatted_sections = []
+            current_section = None
             for section in sections:
+                section = section.strip()
+                if not section:
+                    continue
+                # Check if this is a new section
+                if any(section.startswith(str(i)) for i in range(1, 6)):
+                    current_section = f"### {section}"
+                    formatted_sections.append(current_section)
+                elif current_section:
+                    # Add bullet points to content under sections
+                    lines = section.split('\n')
+                    formatted_lines = []
+                    for line in lines:
+                        line = line.strip()
+                        if line:
+                            if not line.startswith('- '):
+                                line = f"- {line}"
+                            formatted_lines.append(line)
+                    formatted_sections.append('\n'.join(formatted_lines))
             return '\n\n'.join(formatted_sections)
         except Exception as e:
 def analyze_financial_statements(income_statement, balance_sheet):
     """Main analysis function with improved error handling and logging"""
     try:
+        clear_gpu_memory()
+        logger.info("Starting financial analysis...")
         analyzer = FinancialAnalyzer()
         # Validate inputs
             return "Error: Please provide both income statement and balance sheet files"
         # Process financial statements
+        logger.info("Processing income statement...")
         income_summary = analyzer.read_csv(income_statement)
+        logger.info("Processing balance sheet...")
         balance_summary = analyzer.read_csv(balance_sheet)
         financial_data = f"""
         """
         # Generate analysis
+        logger.info("Starting strategic analysis generation...")
         analysis = analyzer.generate_analysis(financial_data)
+        if "Error" in analysis:
+            logger.error("Strategic analysis generation failed")
+            return "Error: Failed to generate strategic analysis. Please try again."
         # Analyze sentiment
+        logger.info("Starting sentiment analysis...")
         sentiment = analyzer.analyze_sentiment(analysis)
+        if sentiment[0][0]['label'] == "error":
+            logger.error("Sentiment analysis failed")
+            return "Error: Failed to analyze sentiment. Please try again."
         # Generate recommendations
+        logger.info("Starting recommendations generation...")
         recommendations = analyzer.generate_recommendations(analysis)
+        if "Error" in recommendations:
+            logger.error("Recommendations generation failed")
+            return "Error: Failed to generate recommendations. Please try again."
         # Format results
+        logger.info("Formatting final results...")
         result = format_results(analysis, sentiment, recommendations)
+        clear_gpu_memory()
+        logger.info("Analysis completed successfully")
         return result
     except Exception as e:
         Please verify:
         1. Files are valid CSV format
         2. Files contain required financial data
+        3. File size is within limits (max 10MB)
+        4. Data contains numeric columns
+        5. Files are not corrupted"""
 def format_results(analysis, sentiment, recommendations):
     """Format analysis results with improved validation and formatting"""
         logger.error(f"Formatting error: {str(e)}")
         return "Error formatting results"
+# Create Gradio interface with improved error handling and guidance
 iface = gr.Interface(
     fn=analyze_financial_statements,
     inputs=[
+        gr.File(
+            label="Income Statement (CSV)",
+            info="Upload income statement in CSV format with numeric data columns"
+        ),
+        gr.File(
+            label="Balance Sheet (CSV)",
+            info="Upload balance sheet in CSV format with numeric data columns"
+        )
     ],
     outputs=gr.Markdown(),
+    title="AI-Powered Financial Statement Analyzer",
+    description="""## Financial Statement Analysis Tool
+This tool provides comprehensive financial analysis using advanced AI models:
+- Strategic Analysis: In-depth analysis of financial position and trends
+- Sentiment Analysis: Assessment of financial health sentiment
+- Strategic Recommendations: Actionable insights and recommendations
+Requirements:
+- Files must be in CSV format
+- Must contain numeric data columns
+- Maximum file size: 10MB
+- Standard financial statement format preferred
+Note: Analysis may take a few minutes to complete.""",
+    article="""### Usage Tips:
+1. Ensure your CSV files have clear column headers
+2. Verify that numeric data is properly formatted
+3. Wait for the analysis to complete - it may take several minutes
+4. The more detailed your financial data, the better the analysis
+For optimal results, include key financial metrics such as:
+- Revenue
+- Expenses
+- Profits/Losses
+- Assets
+- Liabilities
+- Equity""",
+    examples=[
+        ["example_income_statement.csv", "example_balance_sheet.csv"]
+    ],
     flagging_mode="never"
 )
+# Launch the interface with proper error handling
 if __name__ == "__main__":
     try:
+        # Enable queue for better handling of multiple requests
         iface.queue()
+        # Launch with specific server configuration
         iface.launch(
             share=False,
             server_name="0.0.0.0",
+            server_port=7860,
+            show_error=True,
+            max_threads=4
         )
     except Exception as e:
         logger.error(f"Launch error: {str(e)}")