|
import os |
|
import io |
|
import re |
|
import streamlit as st |
|
|
|
|
|
# Page-level Streamlit settings: wide layout, sidebar collapsed on first load.
# NOTE(review): this call sits between the import groups, presumably so it
# runs before any other Streamlit command — confirm before moving it.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="collapsed",
)
|
|
|
from PIL import Image |
|
import fitz |
|
|
|
from reportlab.lib.pagesizes import A4 |
|
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle |
|
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle |
|
from reportlab.lib import colors |
|
from reportlab.pdfbase import pdfmetrics |
|
from reportlab.pdfbase.ttfonts import TTFont |
|
|
|
|
|
|
|
|
|
# Display name shown in the sidebar -> TrueType file to load for it.
# The values are bare file names; presumably they are resolved against the
# working directory or ReportLab's font search path — TODO confirm.
available_fonts = {
    "NotoEmoji Variable": "NotoEmoji-VariableFont_wght.ttf",
    "NotoEmoji Bold": "NotoEmoji-Bold.ttf",
    "NotoEmoji Light": "NotoEmoji-Light.ttf",
    "NotoEmoji Medium": "NotoEmoji-Medium.ttf",
    "NotoEmoji Regular": "NotoEmoji-Regular.ttf",
    "NotoEmoji SemiBold": "NotoEmoji-SemiBold.ttf",
}

# Let the user pick one of the fonts above from the sidebar.
selected_font_name = st.sidebar.selectbox(
    "Select NotoEmoji Font",
    options=list(available_fonts),
)
selected_font_path = available_fonts[selected_font_name]

# Register the chosen face with ReportLab under its display name.
pdfmetrics.registerFont(TTFont(selected_font_name, selected_font_path))
|
|
|
|
|
|
|
# Default outline text: one "# " title, four "## " sections, and twelve
# numbered items that each carry three "- " sub-items.
# NOTE(review): the glyphs after each item number (e.g. "π₯", "β³") look like
# emoji whose UTF-8 bytes were decoded with a single-byte Greek codec. Some of
# the bytes are gone, so the intended emoji cannot be recovered from this text
# alone — restore them from an uncorrupted copy of the file.
default_markdown = """# Cutting-Edge ML Outline

## Core ML Techniques
1. π **Mixture of Experts (MoE)**
- Conditional computation techniques
- Sparse gating mechanisms
- Training specialized sub-models

2. π₯ **Supervised Fine-Tuning (SFT) using PyTorch**
- Loss function customization
- Gradient accumulation strategies
- Learning rate schedulers

3. π€ **Large Language Models (LLM) using Transformers**
- Attention mechanisms
- Tokenization strategies
- Position encodings

## Training Methods
4. π **Self-Rewarding Learning using NPS 0-10 and Verbatims**
- Custom reward functions
- Feedback categorization
- Signal extraction from text

5. π **Reinforcement Learning from Human Feedback (RLHF)**
- Preference datasets
- PPO implementation
- KL divergence constraints

6. π **MergeKit: Merging Models to Same Embedding Space**
- TIES merging
- Task arithmetic
- SLERP interpolation

## Optimization & Deployment
7. π **DistillKit: Model Size Reduction with Spectrum Analysis**
- Knowledge distillation
- Quantization techniques
- Model pruning strategies

8. π§ **Agentic RAG Agents using Document Inputs**
- Vector database integration
- Query planning
- Self-reflection mechanisms

9. β³ **Longitudinal Data Summarization from Multiple Docs**
- Multi-document compression
- Timeline extraction
- Entity tracking

## Knowledge Representation
10. π **Knowledge Extraction using Markdown Knowledge Graphs**
- Entity recognition
- Relationship mapping
- Hierarchical structuring

11. πΊοΈ **Knowledge Mapping with Mermaid Diagrams**
- Flowchart generation
- Sequence diagram creation
- State diagrams

12. π» **ML Code Generation with Streamlit/Gradio/HTML5+JS**
- Code completion
- Unit test generation
- Documentation synthesis
"""
|
|
|
|
|
|
|
def markdown_to_pdf_content(markdown_text):
    """Parse outline-style markdown into two columns of PDF content.

    Lines are stripped and blank lines ignored. Each remaining line is
    handled by its prefix:

    * ``# `` (title) lines are skipped.
    * ``## `` lines become the string ``"<b>heading</b>"``.
    * Lines starting with digits and a dot (``1.``) open a numbered item.
    * ``- `` lines that follow a numbered item are collected as its
      sub-items.
    * Any other line is kept as a plain string when no numbered item is
      open, and ignored while one is open.

    Args:
        markdown_text: The markdown source as a single string.

    Returns:
        A ``(left_column, right_column)`` tuple of lists. Entries are either
        plain strings or ``[item_text, sub_item_list]`` pairs. The parsed
        entries are split at ``len // 2``, so the right column receives the
        extra entry when the count is odd.
    """
    numbered_item = re.compile(r'\d+\.')  # compiled once, used per line

    pdf_content = []
    current_item = None
    sub_items = []
    in_list_item = False

    def flush_item():
        """Emit the open numbered item, if any, and reset the accumulator."""
        nonlocal current_item, sub_items
        # Emit the item even when it has no sub-items; gating on sub_items
        # being non-empty silently dropped such items from the output.
        if current_item is not None:
            pdf_content.append([current_item, sub_items])
        current_item = None
        sub_items = []

    for raw_line in markdown_text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('# '):
            # Title line: not rendered as part of the columns.
            continue

        if line.startswith('## '):
            flush_item()
            # Drop only the leading marker; str.replace would also delete
            # any later "## " inside the heading text.
            section = line[len('## '):].strip()
            pdf_content.append(f"<b>{section}</b>")
            in_list_item = False
        elif numbered_item.match(line):
            flush_item()
            current_item = line
            in_list_item = True
        elif line.startswith('- ') and in_list_item:
            sub_items.append(line)
        elif not in_list_item:
            pdf_content.append(line)

    flush_item()

    mid_point = len(pdf_content) // 2
    return pdf_content[:mid_point], pdf_content[mid_point:]
|
|
|
|
|
|
|
def create_main_pdf(markdown_text, base_font_size=10, auto_size=False): |
|
buffer = io.BytesIO() |
|
doc = SimpleDocTemplate( |
|
buffer, |
|
pagesize=(A4[1], A4[0]), |
|
leftMargin=36, |
|
rightMargin=36, |
|
topMargin=36, |
|
bottomMargin=36 |
|
) |
|
|
|
styles = getSampleStyleSheet() |
|
story = [] |
|
spacer_height = 10 |
|
left_column, right_column = markdown_to_pdf_content(markdown_text) |
|
|
|
|
|
total_items = 0 |
|
for col in (left_column, right_column): |
|
for item in col: |
|
if isinstance(item, list): |
|
main_item, sub_items |
|
|