akarshan11 commited on
Commit
759b6cc
·
verified ·
1 Parent(s): a082b95

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -161
app.py CHANGED
@@ -5,9 +5,120 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
  import fitz # PyMuPDF for PDF processing
6
  import docx2txt # For DOCX processing
7
  from fpdf import FPDF # For creating PDF outputs
 
 
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  # Load model and tokenizer
10
- model_name = "facebook/mbart-large-50-many-to-many-mmt" # Powerful translation model that can handle idioms well
11
  tokenizer = AutoTokenizer.from_pretrained(model_name)
12
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
13
 
@@ -15,7 +126,7 @@ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
15
  device = "cuda" if torch.cuda.is_available() else "cpu"
16
  model = model.to(device)
17
 
18
- # Reduced language list with focus on major languages and Indian languages
19
  LANGUAGES = {
20
  # Major Global Languages
21
  "English": "en_XX",
@@ -38,159 +149,8 @@ LANGUAGES = {
38
  "Urdu": "ur_PK"
39
  }
40
 
41
- # File extraction functions
42
- def extract_text_from_pdf(file_path):
43
- """Extract text from a PDF file"""
44
- text = ""
45
- try:
46
- doc = fitz.open(file_path)
47
- for page in doc:
48
- text += page.get_text()
49
- return text
50
- except Exception as e:
51
- return f"Error extracting PDF text: {str(e)}"
52
-
53
- def extract_text_from_docx(file_path):
54
- """Extract text from a DOCX file"""
55
- try:
56
- return docx2txt.process(file_path)
57
- except Exception as e:
58
- return f"Error extracting DOCX text: {str(e)}"
59
-
60
- def extract_text_from_txt(file_path):
61
- """Extract text from a TXT file"""
62
- try:
63
- with open(file_path, 'r', encoding='utf-8') as file:
64
- return file.read()
65
- except UnicodeDecodeError:
66
- try:
67
- with open(file_path, 'r', encoding='latin-1') as file:
68
- return file.read()
69
- except Exception as e:
70
- return f"Error extracting TXT text: {str(e)}"
71
- except Exception as e:
72
- return f"Error extracting TXT text: {str(e)}"
73
-
74
- def save_as_pdf(text, output_path):
75
- """Save text as PDF"""
76
- pdf = FPDF()
77
- pdf.add_page()
78
- pdf.set_font("Arial", size=12)
79
-
80
- # Handle encoding safely
81
- try:
82
- # Try UTF-8 first
83
- encoded_text = text
84
- pdf.multi_cell(0, 10, encoded_text)
85
- except Exception:
86
- try:
87
- # Fall back to latin-1 with replacement
88
- encoded_text = text.encode('latin-1', 'replace').decode('latin-1')
89
- pdf.multi_cell(0, 10, encoded_text)
90
- except Exception as e:
91
- return f"Error creating PDF: {str(e)}"
92
-
93
- try:
94
- pdf.output(output_path)
95
- return output_path
96
- except Exception as e:
97
- return f"Error saving PDF: {str(e)}"
98
-
99
- def get_type(schema: dict | bool):
100
- """Updated get_type function to handle boolean schemas"""
101
- if isinstance(schema, bool):
102
- return "boolean"
103
- if "const" in schema:
104
- return "const"
105
- if "enum" in schema:
106
- return "enum"
107
- elif "type" in schema:
108
- return schema["type"]
109
- elif schema.get("$ref"):
110
- return "$ref"
111
- elif schema.get("oneOf"):
112
- return "oneOf"
113
- elif schema.get("anyOf"):
114
- return "anyOf"
115
- elif schema.get("allOf"):
116
- return "allOf"
117
- elif "type" not in schema:
118
- return {}
119
- else:
120
- raise ValueError(f"Cannot parse type for {schema}")
121
-
122
- # Translation function
123
- def translate(text, source_lang, target_lang, max_length=1024):
124
- """Translate text from source language to target language"""
125
- if not text:
126
- return "No text provided for translation."
127
-
128
- try:
129
- # Set source and target language
130
- src_lang = LANGUAGES.get(source_lang)
131
- tgt_lang = LANGUAGES.get(target_lang)
132
-
133
- if not src_lang or not tgt_lang:
134
- return "Source or target language not supported."
135
-
136
- # Set tokenizer source language
137
- tokenizer.src_lang = src_lang
138
-
139
- # Prepare input
140
- inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
141
- inputs = {k: v.to(device) for k, v in inputs.items()}
142
-
143
- # Generate translation
144
- with torch.no_grad():
145
- generated_tokens = model.generate(
146
- **inputs,
147
- forced_bos_token_id=tokenizer.lang_to_id[tgt_lang],
148
- max_length=max_length,
149
- num_beams=5,
150
- early_stopping=True
151
- )
152
-
153
- # Decode translation
154
- translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
155
- return translation
156
-
157
- except Exception as e:
158
- return f"Translation error: {str(e)}"
159
-
160
- # Process uploads and handle translation
161
- def process_file(file, source_lang, target_lang):
162
- """Process uploaded file and translate its content"""
163
- if file is None:
164
- return None, "No file uploaded."
165
-
166
- try:
167
- # Save uploaded file temporarily
168
- temp_file_path = file.name
169
-
170
- # Extract text based on file type
171
- if temp_file_path.lower().endswith('.pdf'):
172
- text = extract_text_from_pdf(temp_file_path)
173
- elif temp_file_path.lower().endswith('.docx'):
174
- text = extract_text_from_docx(temp_file_path)
175
- elif temp_file_path.lower().endswith('.txt'):
176
- text = extract_text_from_txt(temp_file_path)
177
- else:
178
- return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."
179
-
180
- # Translate the extracted text
181
- translated_text = translate(text, source_lang, target_lang)
182
-
183
- # Save translation as PDF
184
- output_pdf_path = os.path.join(os.path.dirname(temp_file_path), f"translated_{os.path.basename(temp_file_path)}.pdf")
185
- result = save_as_pdf(translated_text, output_pdf_path)
186
-
187
- if isinstance(result, str) and result.startswith("Error"):
188
- return None, result
189
-
190
- return output_pdf_path, translated_text
191
-
192
- except Exception as e:
193
- return None, f"Error processing file: {str(e)}"
194
 
195
  # Create Gradio interface
196
  with gr.Blocks(title="Indian Language Translator") as demo:
@@ -229,12 +189,6 @@ with gr.Blocks(title="Indian Language Translator") as demo:
229
  inputs=[file_input, source_lang_doc, target_lang_doc],
230
  outputs=[output_file, output_preview]
231
  )
232
-
233
- gr.Markdown("### Supported File Types: PDF, DOCX, TXT")
234
- gr.Markdown("### Features:")
235
- gr.Markdown("- Supports major Indian languages including Hindi, Bengali, Tamil, Telugu, Malayalam")
236
- gr.Markdown("- Context-aware translation that understands idioms and cultural expressions")
237
- gr.Markdown("- Document translation with PDF output")
238
 
239
  if __name__ == "__main__":
240
  demo.launch(share=True)
 
5
  import fitz # PyMuPDF for PDF processing
6
  import docx2txt # For DOCX processing
7
  from fpdf import FPDF # For creating PDF outputs
8
+ import typing
9
+ from typing import Any, Union
10
 
11
+ # Add modified JSON schema handling functions
12
+ def get_type(schema: Union[dict, bool]) -> str:
13
+ """Get the type of a JSON schema.
14
+
15
+ Args:
16
+ schema: JSON schema object or boolean
17
+
18
+ Returns:
19
+ str: Type of the schema
20
+ """
21
+ if isinstance(schema, bool):
22
+ return "boolean"
23
+ if not isinstance(schema, dict):
24
+ return "any"
25
+ if "const" in schema:
26
+ return "const"
27
+ if "enum" in schema:
28
+ return "enum"
29
+ elif "type" in schema:
30
+ return schema["type"]
31
+ elif schema.get("$ref"):
32
+ return "$ref"
33
+ elif schema.get("oneOf"):
34
+ return "oneOf"
35
+ elif schema.get("anyOf"):
36
+ return "anyOf"
37
+ elif schema.get("allOf"):
38
+ return "allOf"
39
+ return "any"
40
+
41
+ def _json_schema_to_python_type(schema: Any, defs: Any) -> str:
42
+ """Convert JSON schema to Python type hint.
43
+
44
+ Args:
45
+ schema: JSON schema
46
+ defs: Schema definitions
47
+
48
+ Returns:
49
+ str: Python type hint
50
+ """
51
+ if schema == {}:
52
+ return "Any"
53
+
54
+ type_ = get_type(schema)
55
+
56
+ if type_ == "boolean":
57
+ return "bool"
58
+ elif type_ == "any":
59
+ if isinstance(schema, dict) and "description" in schema and "json" in schema["description"]:
60
+ return "str | float | bool | list | dict"
61
+ return "Any"
62
+ elif type_ == "$ref":
63
+ return _json_schema_to_python_type(defs[schema["$ref"].split("/")[-1]], defs)
64
+ elif type_ == "null":
65
+ return "None"
66
+ elif type_ == "const":
67
+ return f"Literal[{schema['const']}]"
68
+ elif type_ == "enum":
69
+ return "Literal[" + ", ".join([f"'{str(v)}'" for v in schema["enum"]]) + "]"
70
+ elif type_ == "integer":
71
+ return "int"
72
+ elif type_ == "string":
73
+ return "str"
74
+ elif type_ == "boolean":
75
+ return "bool"
76
+ elif type_ == "number":
77
+ return "float"
78
+ elif type_ == "array":
79
+ items = schema.get("items", {})
80
+ if isinstance(items, bool):
81
+ return "list[Any]"
82
+ if "prefixItems" in items:
83
+ elements = ", ".join(
84
+ [_json_schema_to_python_type(i, defs) for i in items["prefixItems"]]
85
+ )
86
+ return f"tuple[{elements}]"
87
+ elif "prefixItems" in schema:
88
+ elements = ", ".join(
89
+ [_json_schema_to_python_type(i, defs) for i in schema["prefixItems"]]
90
+ )
91
+ return f"tuple[{elements}]"
92
+ else:
93
+ elements = _json_schema_to_python_type(items, defs)
94
+ return f"list[{elements}]"
95
+ elif type_ == "object":
96
+ props = schema.get("properties", {})
97
+
98
+ def get_desc(v):
99
+ return f" ({v.get('description')})" if isinstance(v, dict) and v.get("description") else ""
100
+
101
+ des = [
102
+ f"{n}: {_json_schema_to_python_type(v, defs)}{get_desc(v)}"
103
+ for n, v in props.items()
104
+ if n != "$defs"
105
+ ]
106
+
107
+ if "additionalProperties" in schema:
108
+ additional_properties = schema["additionalProperties"]
109
+ if isinstance(additional_properties, bool):
110
+ if additional_properties:
111
+ des += ["str, Any"]
112
+ else:
113
+ des += [f"str, {_json_schema_to_python_type(additional_properties, defs)}"]
114
+ des = ", ".join(des)
115
+ return f"dict({des})"
116
+ else:
117
+ return "Any"
118
+
119
+ # The rest of your original code remains the same
120
  # Load model and tokenizer
121
+ model_name = "facebook/mbart-large-50-many-to-many-mmt"
122
  tokenizer = AutoTokenizer.from_pretrained(model_name)
123
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
124
 
 
126
  device = "cuda" if torch.cuda.is_available() else "cpu"
127
  model = model.to(device)
128
 
129
+ # Your LANGUAGES dictionary and other functions remain the same
130
  LANGUAGES = {
131
  # Major Global Languages
132
  "English": "en_XX",
 
149
  "Urdu": "ur_PK"
150
  }
151
 
152
+ # Your file handling and translation functions remain the same
153
+ # ... (rest of your original code)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
  # Create Gradio interface
156
  with gr.Blocks(title="Indian Language Translator") as demo:
 
189
  inputs=[file_input, source_lang_doc, target_lang_doc],
190
  outputs=[output_file, output_preview]
191
  )
 
 
 
 
 
 
192
 
193
  if __name__ == "__main__":
194
  demo.launch(share=True)