Spaces:

mgbam
/

sythenticdata

Sleeping

App Files Community

mgbam committed on Feb 8

Commit

218d2f0

verified ·

1 Parent(s): 0f3f863

Update app.py

Browse files

Files changed (1) hide show

app.py +156 -221

app.py CHANGED Viewed

@@ -17,280 +17,215 @@ import traceback
 # Configuration
 MAX_THREADS = 4
 SUPPORTED_MODELS = {
-    "Deepseek": "deepseek-chat",
-    "Llama-3-70B": "meta-llama/Meta-Llama-3-70B-Instruct",
-    "Mixtral": "mistralai/Mixtral-8x7B-Instruct-v0.1"
 }
 def initialize_session_state():
-    """Initialize all session state variables"""
-    defaults = {
         'document_data': [],
         'qa_pairs': [],
         'processing_complete': False,
         'current_stage': 'idle',
         'api_keys': {},
         'model_choice': "Deepseek",
-        'temperature': 0.3
     }
-    for key, value in defaults.items():
         if key not in st.session_state:
             st.session_state[key] = value
-def secure_api_management():
-    """Advanced API key management with encryption"""
-    with st.sidebar:
-        st.header("🔑 API Management")
-        provider = st.selectbox("Provider", list(SUPPORTED_MODELS.keys()))
-        new_key = st.text_input(f"Enter {provider} API Key", type="password")
-        if st.button("Store Key"):
-            if new_key:
-                hashed_key = hashlib.sha256(new_key.encode()).hexdigest()
-                st.session_state.api_keys[provider] = hashed_key
-                st.success("Key stored securely")
-            else:
-                st.error("Please enter a valid API key")
 def process_image(img_data, page_num, img_idx):
-    """Advanced image processing with error handling"""
     try:
         img = img_data["stream"]
         width = int(img_data["width"])
         height = int(img_data["height"])
-        # Determine color mode
-        color_space = getattr(img, "colorspace", "")
-        mode = "RGB"
-        if "/DeviceCMYK" in str(color_space):
-            mode = "CMYK"
-        elif "/DeviceGray" in str(color_space):
-            mode = "L"
         # Convert image to RGB
-        image = Image.frombytes(mode, (width, height), img.get_data())
-        if mode != "RGB":
-            image = image.convert("RGB")
-        return image
-    except Exception as e:
-        st.error(f"Image processing error (Page {page_num}, Image {img_idx}): {str(e)[:100]}")
-        return None
-def process_page(page_data):
-    """Thread-safe page processing"""
-    page_num, page = page_data
-    try:
-        text = page.extract_text() or ""
-        images = []
-        for idx, img in enumerate(page.images):
-            processed_image = process_image(img, page_num, idx)
-            if processed_image:
-                images.append(processed_image)
-        return {"page": page_num, "text": text.strip(), "images": images}
     except Exception as e:
-        st.error(f"Page {page_num} error: {str(e)[:100]}")
         return None
-def advanced_pdf_processor(uploaded_file):
-    """Multi-threaded PDF processing with real-time updates"""
-    st.session_state.document_data = []
-    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
         with pdfplumber.open(uploaded_file) as pdf:
-            future = executor.submit(
-                lambda: list(executor.map(process_page, enumerate(pdf.pages, 1)))
-            )
-            while not future.done():
-                time.sleep(0.1)
-                st.rerun()
-            results = future.result()
-            for result in results:
-                if result:
-                    st.session_state.document_data.append(result)
-                    st.rerun()
-def hybrid_text_extraction(entry):
-    """Multimodal text extraction with fallback"""
-    text_content = entry["text"]
-    if not text_content and entry["images"]:
-        ocr_results = []
-        for img in entry["images"]:
-            try:
-                ocr_results.append(pytesseract.image_to_string(img))
-            except Exception as e:
-                st.warning(f"OCR failed: {str(e)[:100]}")
-        text_content = " ".join(ocr_results).strip()
-    return text_content
-def generate_with_retry(model, messages):
-    """Enterprise-grade LLM generation with retry logic"""
-    client = openai.OpenAI(
-        base_url="https://api.deepseek.com/v1",
-        api_key=st.secrets.get("DEEPSEEK_API_KEY")
-    )
-    for attempt in range(3):
-        try:
             response = client.chat.completions.create(
-                model=SUPPORTED_MODELS[model],
-                messages=messages,
                 max_tokens=2048,
                 response_format={"type": "json_object"},
                 temperature=st.session_state.temperature
             )
-            return json.loads(response.choices[0].message.content)
-        except Exception as e:
-            if attempt == 2:
-                raise
-            time.sleep(2 ** attempt)
-def qa_generation_workflow():
-    """Enterprise Q&A generation pipeline"""
-    with st.status("🚀 AI Processing Pipeline", expanded=True) as status:
-        try:
-            st.write("Initializing neural processors...")
-            total_pages = len(st.session_state.document_data)
-            qa_pairs = []
-            for idx, entry in enumerate(st.session_state.document_data):
-                status.write(f"Processing page {idx+1}/{total_pages}")
-                text_content = hybrid_text_extraction(entry)
-                prompt = f"""Generate 3 sophisticated Q&A pairs from:
-                Page {entry['page']} Content:
-                {text_content}
-                Return JSON format: {{"qa_pairs": [{{"question": "...", "answer_1": "...", "answer_2": "..."}}]}}"""
-                response = generate_with_retry(
-                    st.session_state.model_choice,
-                    [{"role": "user", "content": prompt}]
-                )
-                qa_pairs.extend(response.get("qa_pairs", []))
-            st.session_state.qa_pairs = qa_pairs
-            status.update(label="Processing complete ✅", state="complete")
-        except Exception as e:
-            status.error(f"Processing failed: {traceback.format_exc()[:500]}")
-            st.session_state.processing_complete = False
-def evaluation_interface():
-    """Interactive quality control center"""
-    st.header("🧪 Quality Control Hub")
-    with st.expander("Automated AI Evaluation", expanded=True):
-        if st.button("Run Batch Validation"):
-            with st.spinner("Validating responses..."):
-                time.sleep(2)  # Simulated validation
-                st.success("Quality check passed: 98% accuracy")
-    with st.expander("Human-in-the-Loop Review"):
-        sample_size = min(5, len(st.session_state.qa_pairs))
-        for idx in range(sample_size):
-            pair = st.session_state.qa_pairs[idx]
-            with st.container(border=True):
-                col1, col2 = st.columns([1, 3])
-                with col1:
-                    st.metric("Page", pair["page"])
-                with col2:
-                    st.write(f"**Question:** {pair['question']}")
-                tab1, tab2 = st.tabs(["Answer 1", "Answer 2"])
-                with tab1:
-                    st.write(pair["answer_1"])
-                with tab2:
-                    st.write(pair["answer_2"])
-                st.selectbox(
-                    "Select preferred answer",
-                    ["Answer 1", "Answer 2", "Needs Review"],
-                    key=f"eval_{idx}"
-                )
-def data_export_module():
-    """Enterprise-grade data export system"""
-    st.header("📦 Data Packaging")
-    col1, col2, col3 = st.columns(3)
-    with col1:
-        export_format = st.selectbox("Format", ["JSON", "CSV", "Parquet"])
-    with col2:
-        compression = st.selectbox("Compression", ["None", "gzip", "zip"])
-    with col3:
-        include_metadata = st.checkbox("Include Metadata", True)
-    if st.button("Generate Export Package"):
-        with st.spinner("Packaging data..."):
-            df = pd.DataFrame(st.session_state.qa_pairs)
-            buffer = BytesIO()
-            if export_format == "JSON":
-                df.to_json(buffer, orient="records", indent=2)
-                mime = "application/json"
-            elif export_format == "CSV":
-                df.to_csv(buffer, index=False)
-                mime = "text/csv"
-            else:
-                df.to_parquet(buffer, compression=compression if compression != "None" else None)
-                mime = "application/octet-stream"
-            st.download_button(
-                label="Download Dataset",
-                data=buffer.getvalue(),
-                file_name=f"synthetic_data_{int(time.time())}.{export_format.lower()}",
-                mime=mime
-            )
-def main_interface():
-    """Core application interface"""
-    st.title("🏭 Synthetic Data Factory")
-    st.write("Industrial-scale synthetic data generation powered by cutting-edge AI")
-    # Processing pipeline
-    if uploaded_file := st.sidebar.file_uploader("Upload PDF Document", type=["pdf"]):
-        if st.sidebar.button("Start Generation"):
-            st.session_state.processing_complete = False
-            advanced_pdf_processor(uploaded_file)
-            qa_generation_workflow()
-            st.session_state.processing_complete = True
-    # Display results
-    if st.session_state.processing_complete:
-        evaluation_interface()
-        data_export_module()
 def main():
-    """Main application entry point"""
     st.set_page_config(
-        page_title="Synthetic Data Factory",
-        page_icon="🏭",
         layout="wide"
     )
     initialize_session_state()
-    secure_api_management()
     with st.sidebar:
-        st.header("⚙️ Engine Configuration")
         st.session_state.model_choice = st.selectbox(
-            "AI Model",
-            list(SUPPORTED_MODELS.keys())
         )
         st.session_state.temperature = st.slider(
-            "Creativity Level",
-            0.0, 1.0, 0.3
         )
-    main_interface()
 if __name__ == "__main__":
     main()

 # Configuration
+# NOTE(review): MAX_THREADS is not referenced by any function visible in
+# this revision — confirm whether it is still needed.
 MAX_THREADS = 4
+# Keyed by the model label offered in the sidebar selectbox. Each entry holds
+# the model id passed as `model=` to the chat-completions call and the
+# base URL handed to the OpenAI client.
 SUPPORTED_MODELS = {
+    "Deepseek": {
+        "model": "deepseek-chat",
+        "base_url": "https://api.deepseek.com/v1"
+    }
 }
+def debug_log(message):
+    """Show `message` as a toast, but only while debug mode is switched on."""
+    if not st.session_state.get("debug_mode"):
+        return
+    st.toast(f"DEBUG: {message}", icon="🐛")
 def initialize_session_state():
+    """Seed st.session_state with default values.
+
+    Only keys that are missing are written; existing entries are left
+    untouched.
+    """
+    defaults = {
         'document_data': [],
         'qa_pairs': [],
         'processing_complete': False,
         'current_stage': 'idle',
         'api_keys': {},
         'model_choice': "Deepseek",
+        'temperature': 0.3,
+        'debug_mode': True,
     }
+    for key, value in defaults.items():
         if key not in st.session_state:
             st.session_state[key] = value
+def show_processing_status():
+    """Write the current pipeline stage to the sidebar as a status line.
+
+    Stages without a known label are shown as 'Unknown'.
+    """
+    stage = st.session_state.current_stage
+    debug_log(f"Status update: {stage}")
+    labels = {
+        'idle': "🟢 Ready to process",
+        'extracting': "🔍 Extracting document content...",
+        'generating': "🧠 Generating Q&A pairs...",
+        'evaluating': "📊 Evaluating results...",
+        'error': "❌ Processing failed",
+    }
+    label = labels.get(stage, 'Unknown')
+    st.sidebar.markdown(f"**System Status:** {label}")
 def process_image(img_data, page_num, img_idx):
+    """Decode one embedded PDF image into an RGB PIL image.
+
+    Args:
+        img_data: Mapping with the keys "stream", "width" and "height";
+            the stream object must provide `get_data()`.
+        page_num: Page number, used only in log and error messages.
+        img_idx: Index of the image on its page, used only in messages.
+
+    Returns:
+        An RGB `Image`, or None when the image could not be decoded (the
+        failure is reported with `st.error`).
+    """
     try:
         img = img_data["stream"]
         width = int(img_data["width"])
         height = int(img_data["height"])
+        debug_log(f"Processing image {img_idx} on page {page_num}")
+        # Fetch the raw bytes once; both decode attempts share the buffer.
+        raw = img.get_data()
         # Convert image to RGB
+        try:
+            return Image.frombytes("RGB", (width, height), raw)
+        except ValueError:
+            # Only the decode error triggers the grayscale retry. The former
+            # bare `except:` also caught BaseException-derived control-flow
+            # exceptions, which must be allowed to propagate.
+            return Image.frombytes("L", (width, height), raw).convert("RGB")
     except Exception as e:
+        st.error(f"Image processing failed (Page {page_num}, Image {img_idx}): {str(e)}")
         return None
+def pdf_processing_workflow(uploaded_file):
+    """Extract text and embedded images from every page of a PDF.
+
+    Results are stored in `st.session_state.document_data` as one dict per
+    page with the keys "page", "text" and "images". A page that raises is
+    reported with `st.error` and skipped; the remaining pages are still
+    processed.
+
+    Args:
+        uploaded_file: The uploaded PDF, passed straight to `pdfplumber.open`.
+
+    Returns:
+        True when the document was opened and iterated, False when that
+        raised (the stage is then set to 'error').
+    """
+    st.session_state.current_stage = 'extracting'
+    # Reset earlier results; otherwise a second run appends the same pages
+    # again and every later step sees duplicates.
+    st.session_state.document_data = []
+    try:
         with pdfplumber.open(uploaded_file) as pdf:
+            total_pages = len(pdf.pages)
+            progress_bar = st.progress(0)
+            status_text = st.empty()
+            for page_num, page in enumerate(pdf.pages, 1):
+                status_text.text(f"Processing page {page_num}/{total_pages}")
+                progress_bar.progress(page_num / total_pages)
+                try:
+                    text = page.extract_text() or ""
+                    images = [process_image(img, page_num, idx)
+                              for idx, img in enumerate(page.images)]
+                    st.session_state.document_data.append({
+                        "page": page_num,
+                        "text": text.strip(),
+                        # process_image returns None for undecodable images
+                        "images": [img for img in images if img is not None]
+                    })
+                except Exception as e:
+                    st.error(f"Page {page_num} error: {str(e)}")
+            progress_bar.empty()
+            status_text.success("Document processing complete!")
+            return True
+    except Exception as e:
+        st.session_state.current_stage = 'error'
+        st.error(f"PDF processing failed: {str(e)}")
+        debug_log(traceback.format_exc())
+        return False
+def generate_qa_pairs():
+    """Ask the chat model for Q&A pairs for every extracted page.
+
+    Reads `st.session_state.document_data`, uses the page text (or OCR of
+    the page images when the text is empty) as prompt material and stores
+    the collected pairs in `st.session_state.qa_pairs`. Pages with no
+    usable content and pages whose reply is not valid JSON are skipped.
+
+    Returns:
+        True on success (stage becomes 'evaluating'), False when an
+        unexpected error aborted the run (stage becomes 'error').
+    """
+    st.session_state.current_stage = 'generating'
+    qa_pairs = []
+    try:
+        model_cfg = SUPPORTED_MODELS[st.session_state.model_choice]
+        client = openai.OpenAI(
+            base_url=model_cfg["base_url"],
+            api_key=st.secrets["DEEPSEEK_API_KEY"]
+        )
+        for entry in st.session_state.document_data:
+            # Fall back to OCR of the page images when there is no text.
+            text_content = entry["text"] or " ".join([
+                pytesseract.image_to_string(img) for img in entry["images"]
+            ])
+            if not text_content.strip():
+                # Nothing to ask about: skip rather than spend an API call
+                # on an empty prompt.
+                debug_log(f"Skipping page {entry['page']}: no text content")
+                continue
             response = client.chat.completions.create(
+                model=model_cfg["model"],
+                messages=[{
+                    "role": "user",
+                    # The format example uses double quotes so that it is
+                    # itself valid JSON.
+                    "content": (
+                        f"Generate 3 Q&A pairs from:\n{text_content}\n"
+                        'Return JSON format: {"qa_pairs": [{"question": "...", '
+                        '"answer_1": "...", "answer_2": "..."}]}'
+                    )
+                }],
                 max_tokens=2048,
                 response_format={"type": "json_object"},
                 temperature=st.session_state.temperature
             )
+            try:
+                result = json.loads(response.choices[0].message.content)
+            except (json.JSONDecodeError, TypeError):
+                # TypeError covers a reply whose content is None.
+                st.error(f"Invalid response format from API for page {entry['page']}")
+                continue
+            # A top-level JSON value other than an object carries no pairs.
+            pairs = result.get("qa_pairs", []) if isinstance(result, dict) else []
+            qa_pairs.extend(pairs)
+            debug_log(f"Generated {len(pairs)} pairs for page {entry['page']}")
+        st.session_state.qa_pairs = qa_pairs
+        st.session_state.current_stage = 'evaluating'
+        return True
+    except Exception as e:
+        st.session_state.current_stage = 'error'
+        st.error(f"Q&A generation failed: {str(e)}")
+        debug_log(traceback.format_exc())
+        return False
 def main():
+    """Build the page: sidebar settings, upload, processing, results, debug.
+
+    Results are drawn from session state rather than inside the button
+    branch, so they remain visible on reruns triggered by other widgets
+    (for example the download button).
+    """
     st.set_page_config(
+        page_title="Synthetic Data Generator",
+        page_icon="🧪",
         layout="wide"
     )
     initialize_session_state()
+    # Sidebar: model settings, debug toggle and pipeline status
     with st.sidebar:
+        st.header("⚙️ Configuration")
         st.session_state.model_choice = st.selectbox(
+            "AI Model", list(SUPPORTED_MODELS.keys())
         )
         st.session_state.temperature = st.slider(
+            "Creativity Level", 0.0, 1.0, 0.3
         )
+        st.session_state.debug_mode = st.checkbox("Debug Mode", True)
+        show_processing_status()
+    st.title("🧪 Synthetic Data Generator")
+    # File upload section
+    uploaded_file = st.file_uploader("Upload PDF Document", type=["pdf"])
+    if uploaded_file and st.button("Start Processing"):
+        # Clear the flag first so a failed run hides stale results.
+        st.session_state.processing_complete = False
+        if pdf_processing_workflow(uploaded_file) and generate_qa_pairs():
+            st.session_state.processing_complete = True
+            st.success("Processing completed successfully!")
+    # The button is only truthy on the run right after the click, so the
+    # results are gated on the persisted flag instead.
+    if st.session_state.processing_complete:
+        # Show results
+        st.header("Generated Q&A Pairs")
+        for idx, pair in enumerate(st.session_state.qa_pairs[:10]):
+            # .get(): the model is not guaranteed to emit every key.
+            with st.expander(f"Q{idx+1}: {pair.get('question', '')}"):
+                st.write(f"**Answer 1:** {pair.get('answer_1', '')}")
+                st.write(f"**Answer 2:** {pair.get('answer_2', '')}")
+        # Data export
+        st.header("Data Export")
+        df = pd.DataFrame(st.session_state.qa_pairs)
+        st.download_button(
+            label="Download as CSV",
+            data=df.to_csv(index=False).encode('utf-8'),
+            file_name="synthetic_data.csv",
+            mime="text/csv"
+        )
+    # Debug information
+    if st.session_state.debug_mode:
+        with st.expander("Debug Information"):
+            st.write("### Session State")
+            st.json(st.session_state)
+            if st.session_state.get("document_data"):
+                st.write("### Document Data Summary")
+                st.write(f"Pages processed: {len(st.session_state.document_data)}")
+                st.write(f"Total images extracted: {sum(len(p['images']) for p in st.session_state.document_data)}")
+            if st.session_state.get("qa_pairs"):
+                st.write("### Q&A Statistics")
+                st.write(f"Total pairs generated: {len(st.session_state.qa_pairs)}")
+                st.write("Sample Q&A pairs:")
+                st.table(pd.DataFrame(st.session_state.qa_pairs[:3]))
 if __name__ == "__main__":
     main()