akarshan11 committed on
Commit
14515e7
·
verified ·
1 Parent(s): e68c325

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -114
app.py CHANGED
@@ -5,118 +5,7 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
  import fitz # PyMuPDF for PDF processing
6
  import docx2txt # For DOCX processing
7
  from fpdf import FPDF # For creating PDF outputs
8
- import typing
9
- from typing import Any, Union
10
 
11
- # Add modified JSON schema handling functions
12
def get_type(schema: Union[dict, bool]) -> str:
    """Classify a JSON schema node.

    Args:
        schema: JSON schema object, or a bare boolean schema

    Returns:
        str: One of "boolean", "const", "enum", the schema's declared
        "type", "$ref", "oneOf", "anyOf", "allOf", or "any" as a fallback.
    """
    if isinstance(schema, bool):
        return "boolean"
    if not isinstance(schema, dict):
        return "any"
    # Presence-based keys take priority, in this exact order.
    for key in ("const", "enum", "type"):
        if key in schema:
            return schema["type"] if key == "type" else key
    # Combinator keys only count when their value is truthy.
    for key in ("$ref", "oneOf", "anyOf", "allOf"):
        if schema.get(key):
            return key
    return "any"
40
-
41
def _json_schema_to_python_type(schema: Any, defs: Any) -> str:
    """Convert a JSON schema node into a human-readable Python type hint string.

    Args:
        schema: JSON schema node (dict, or bool for trivial schemas)
        defs: Mapping of schema definition names, used to resolve "$ref" nodes

    Returns:
        str: Python-style type hint describing the schema
    """
    if schema == {}:
        return "Any"

    type_ = get_type(schema)

    if type_ == "boolean":
        # get_type() maps bare boolean schemas (True/False) to "boolean".
        # Bug fix: the original also had a second, unreachable
        # `elif type_ == "boolean"` branch further down; it has been removed.
        return "bool"
    elif type_ == "any":
        # Heuristic: schemas whose description mentions "json" accept any
        # JSON-serializable value.
        if isinstance(schema, dict) and "description" in schema and "json" in schema["description"]:
            return "str | float | bool | list | dict"
        return "Any"
    elif type_ == "$ref":
        # Resolve "#/$defs/Name"-style references by their last path segment.
        return _json_schema_to_python_type(defs[schema["$ref"].split("/")[-1]], defs)
    elif type_ == "null":
        return "None"
    elif type_ == "const":
        return f"Literal[{schema['const']}]"
    elif type_ == "enum":
        # NOTE(review): every enum member is rendered as a quoted string,
        # even non-string values — preserved from the original behavior.
        return "Literal[" + ", ".join([f"'{str(v)}'" for v in schema["enum"]]) + "]"
    elif type_ == "integer":
        return "int"
    elif type_ == "string":
        return "str"
    elif type_ == "number":
        return "float"
    elif type_ == "array":
        items = schema.get("items", {})
        if isinstance(items, bool):
            # "items": true/false — element type is unconstrained.
            return "list[Any]"
        if "prefixItems" in items:
            # Tuple-typed array described inside "items".
            elements = ", ".join(
                [_json_schema_to_python_type(i, defs) for i in items["prefixItems"]]
            )
            return f"tuple[{elements}]"
        elif "prefixItems" in schema:
            # Tuple-typed array described at the schema's top level.
            elements = ", ".join(
                [_json_schema_to_python_type(i, defs) for i in schema["prefixItems"]]
            )
            return f"tuple[{elements}]"
        else:
            elements = _json_schema_to_python_type(items, defs)
            return f"list[{elements}]"
    elif type_ == "object":
        props = schema.get("properties", {})

        def get_desc(v):
            # Append the property's description, when present, in parentheses.
            return f" ({v.get('description')})" if isinstance(v, dict) and v.get("description") else ""

        des = [
            f"{n}: {_json_schema_to_python_type(v, defs)}{get_desc(v)}"
            for n, v in props.items()
            if n != "$defs"
        ]

        if "additionalProperties" in schema:
            additional_properties = schema["additionalProperties"]
            if isinstance(additional_properties, bool):
                # additionalProperties: true -> arbitrary extra string keys;
                # false adds nothing.
                if additional_properties:
                    des += ["str, Any"]
            else:
                des += [f"str, {_json_schema_to_python_type(additional_properties, defs)}"]
        des = ", ".join(des)
        return f"dict({des})"
    else:
        return "Any"
118
-
119
- # The rest of your original code remains the same
120
  # Load model and tokenizer
121
  model_name = "facebook/mbart-large-50-many-to-many-mmt"
122
  tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -126,7 +15,7 @@ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
126
  device = "cuda" if torch.cuda.is_available() else "cpu"
127
  model = model.to(device)
128
 
129
- # Your LANGUAGES dictionary and other functions remain the same
130
  LANGUAGES = {
131
  # Major Global Languages
132
  "English": "en_XX",
@@ -149,8 +38,145 @@ LANGUAGES = {
149
  "Urdu": "ur_PK"
150
  }
151
 
152
- # Your file handling and translation functions remain the same
153
- # ... (rest of your original code)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
  # Create Gradio interface
156
  with gr.Blocks(title="Indian Language Translator") as demo:
@@ -189,6 +215,12 @@ with gr.Blocks(title="Indian Language Translator") as demo:
189
  inputs=[file_input, source_lang_doc, target_lang_doc],
190
  outputs=[output_file, output_preview]
191
  )
 
 
 
 
 
 
192
 
193
  if __name__ == "__main__":
194
  demo.launch(share=True)
 
5
  import fitz # PyMuPDF for PDF processing
6
  import docx2txt # For DOCX processing
7
  from fpdf import FPDF # For creating PDF outputs
 
 
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  # Load model and tokenizer
10
  model_name = "facebook/mbart-large-50-many-to-many-mmt"
11
  tokenizer = AutoTokenizer.from_pretrained(model_name)
 
15
  device = "cuda" if torch.cuda.is_available() else "cpu"
16
  model = model.to(device)
17
 
18
+ # Language mappings
19
  LANGUAGES = {
20
  # Major Global Languages
21
  "English": "en_XX",
 
38
  "Urdu": "ur_PK"
39
  }
40
 
41
+ # Define translation function first
42
def translate(text: str, source_lang: str, target_lang: str, max_length: int = 1024) -> str:
    """
    Translate text from source language to target language.

    Args:
        text: Text to translate
        source_lang: Source language name (a key of LANGUAGES)
        target_lang: Target language name (a key of LANGUAGES)
        max_length: Maximum token length for the input and the generation

    Returns:
        str: Translated text, or a human-readable error message on failure
    """
    if not text:
        return "No text provided for translation."

    try:
        # Map display names to mBART-50 language codes.
        src_lang = LANGUAGES.get(source_lang)
        tgt_lang = LANGUAGES.get(target_lang)

        if not src_lang or not tgt_lang:
            return "Source or target language not supported."

        # Set tokenizer source language
        tokenizer.src_lang = src_lang

        # Prepare input
        inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate translation
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                # Bug fix: MBart-50 tokenizers expose `lang_code_to_id`,
                # not `lang_to_id`; the old attribute raised AttributeError
                # on every call, so the function always returned the
                # "Translation error" string instead of a translation.
                forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
                max_length=max_length,
                num_beams=5,
                early_stopping=True
            )

        # Decode translation
        translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        return translation

    except Exception as e:
        return f"Translation error: {str(e)}"
89
+
90
+ # File handling functions
91
def extract_text_from_pdf(file_path: str) -> str:
    """Extract the concatenated text of every page in a PDF file.

    Args:
        file_path: Path to the PDF file.

    Returns:
        str: Extracted text, or an "Error ..." message string on failure.
    """
    try:
        # Bug fix: the original never closed the document, leaking the file
        # handle; PyMuPDF Documents support the context-manager protocol.
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        return f"Error extracting PDF text: {str(e)}"
101
+
102
def extract_text_from_docx(file_path: str) -> str:
    """Return the plain-text content of a DOCX file, or an error message."""
    try:
        content = docx2txt.process(file_path)
    except Exception as e:
        return f"Error extracting DOCX text: {str(e)}"
    return content
108
+
109
def extract_text_from_txt(file_path: str) -> str:
    """Read a text file, trying UTF-8 first and falling back to Latin-1.

    Returns the file contents, or an "Error ..." message string on failure.
    """
    for encoding in ("utf-8", "latin-1"):
        try:
            with open(file_path, "r", encoding=encoding) as handle:
                return handle.read()
        except UnicodeDecodeError:
            # Not decodable with this codec — try the next one.
            continue
        except Exception as e:
            return f"Error extracting TXT text: {str(e)}"
    # Unreachable in practice: latin-1 decodes any byte sequence.
    return "Error extracting TXT text: could not decode file"
122
+
123
def save_as_pdf(text: str, output_path: str) -> str:
    """Render text into a PDF at output_path.

    Returns output_path on success, or an "Error ..." message string.
    NOTE(review): the Arial core font is Latin-1 only, so non-Latin
    characters are replaced in the fallback path — confirm acceptable.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    try:
        # First attempt: write the text as-is.
        pdf.multi_cell(0, 10, text)
    except Exception:
        try:
            # Fallback: coerce to Latin-1, replacing unencodable characters.
            fallback = text.encode('latin-1', 'replace').decode('latin-1')
            pdf.multi_cell(0, 10, fallback)
        except Exception as e:
            return f"Error creating PDF: {str(e)}"

    try:
        pdf.output(output_path)
    except Exception as e:
        return f"Error saving PDF: {str(e)}"
    return output_path
145
+
146
def process_file(file, source_lang: str, target_lang: str) -> tuple[str | None, str]:
    """Process an uploaded file and translate its content to a PDF.

    Args:
        file: Uploaded file object (must expose a `.name` path attribute).
        source_lang: Source language name (a key of LANGUAGES).
        target_lang: Target language name (a key of LANGUAGES).

    Returns:
        tuple: (path to the translated PDF or None, translated text or an
        error message).
    """
    if file is None:
        return None, "No file uploaded."

    try:
        # Gradio provides the uploaded file already on disk.
        temp_file_path = file.name
        lowered = temp_file_path.lower()

        # Extract text based on file type
        if lowered.endswith('.pdf'):
            text = extract_text_from_pdf(temp_file_path)
        elif lowered.endswith('.docx'):
            text = extract_text_from_docx(temp_file_path)
        elif lowered.endswith('.txt'):
            text = extract_text_from_txt(temp_file_path)
        else:
            return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."

        # Bug fix: the extraction helpers report failure by returning an
        # "Error ..." string; previously that error message was sent to the
        # translator instead of being surfaced to the user.
        if isinstance(text, str) and text.startswith("Error"):
            return None, text

        # Translate the extracted text
        translated_text = translate(text, source_lang, target_lang)

        # Save translation as a PDF next to the uploaded file.
        output_pdf_path = os.path.join(os.path.dirname(temp_file_path),
                                       f"translated_{os.path.basename(temp_file_path)}.pdf")
        result = save_as_pdf(translated_text, output_pdf_path)

        # save_as_pdf also signals failure via an "Error" string.
        if isinstance(result, str) and result.startswith("Error"):
            return None, result

        return output_pdf_path, translated_text

    except Exception as e:
        return None, f"Error processing file: {str(e)}"
180
 
181
  # Create Gradio interface
182
  with gr.Blocks(title="Indian Language Translator") as demo:
 
215
  inputs=[file_input, source_lang_doc, target_lang_doc],
216
  outputs=[output_file, output_preview]
217
  )
218
+
219
+ gr.Markdown("### Supported File Types: PDF, DOCX, TXT")
220
+ gr.Markdown("### Features:")
221
+ gr.Markdown("- Supports major Indian languages including Hindi, Bengali, Tamil, Telugu, Malayalam")
222
+ gr.Markdown("- Context-aware translation that understands idioms and cultural expressions")
223
+ gr.Markdown("- Document translation with PDF output")
224
 
225
  if __name__ == "__main__":
226
  demo.launch(share=True)