Files changed (1) hide show
  1. ap.py +112 -0
ap.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio
2
+ from io import BytesIO
3
+ import fitz
4
+ import tempfile
5
+ import openai
6
+
7
+
8
class TranslationAgent:
    """Chat-based English-to-Italian translator with a short rolling memory.

    The conversation always starts with a fixed system prompt. Each translated
    chunk adds one user message and one assistant message; once the history
    reaches five messages the oldest user/assistant pair is dropped so the
    request size stays bounded.
    """

    def __init__(self, openai_key):
        """Create the agent and configure the OpenAI client.

        Args:
            openai_key: API key used for the OpenAI requests. It is stored on
                the ``openai`` module (``openai.api_key``), so it applies to
                every request made through that module.
        """
        self.memory = []
        system_msg = (
            "You are a translator from english to italian.\n"
            " The only thing you do is to translate.\n"
            " You don't write anything other then the translation of the text you get.\n"
            " The user will only provide the text without asking anything, but what he wants is the translation.\n"
            " Never return the translation of a previously translated part!\n "
            "The text you will need to translate will often include none sense stuff because it is coming from a text extraction of a pdf file including images and table.\n"
            " Do your best to translate also this messy parts."
        )

        self.memory.append({"role": "system", "content": system_msg})

        # Use the key supplied by the caller; credentials must never be
        # hard-coded in the source file.
        openai.api_key = openai_key

    def fade_memory(self):
        """Drop the oldest user/assistant pair once 5+ messages are stored.

        Index 0 (the system prompt) is always kept.
        """
        if len(self.memory) >= 5:
            del self.memory[1:3]

    def translate_chunk(self, chunk):
        """Translate one chunk of text and remember the exchange.

        Args:
            chunk: Text to translate.

        Returns:
            The assistant's reply (the translated text).

        Raises:
            Whatever ``openai.ChatCompletion.create`` raises; in that case the
            stored history is left untouched.
        """
        user_message = {"role": "user", "content": chunk}
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=self.memory + [user_message],
        )
        reply = response["choices"][0]["message"]["content"]

        # Commit both sides of the exchange only after a successful call so a
        # failed request cannot leave an unpaired user message in the history.
        self.memory.append(user_message)
        self.memory.append({"role": "assistant", "content": reply})
        self.fade_memory()
        return reply
37
+
38
+
39
def extract_text_from_pdf(pdf, start, stop):
    """Extract the text of pages ``start`` through ``stop`` from a PDF.

    Args:
        pdf: Raw bytes of the PDF document.
        start: Index of the first page to include (0-based, inclusive).
            ``None`` means "from the first page".
        stop: Index of the last page to include (0-based, inclusive).
            ``None`` means "through the last page".

    Returns:
        The concatenated text of the selected pages, or an empty string when
        no page falls inside the range.
    """
    # A blank number field arrives as None; treat it as an open-ended bound
    # instead of failing on the comparison.
    first = 0 if start is None else start

    page_texts = []
    with fitz.open(stream=BytesIO(pdf), filetype='pdf') as doc:
        for index, page in enumerate(doc):
            if stop is not None and index > stop:
                break  # past the requested range; no need to read further
            if index >= first:
                page_texts.append(page.get_text())

    return "".join(page_texts)
49
+
50
+
51
def split_text(text, chunk_size=100):
    """Split text into chunks that end on a sentence boundary.

    Words are taken from ``text.split()``. A chunk is closed at the first
    word ending in ``'.'`` once it holds at least ``chunk_size`` words, so
    chunks may be longer than ``chunk_size``. Words inside a chunk are joined
    with single spaces.

    Args:
        text: The text to split.
        chunk_size: Minimum number of words per chunk before it may be closed.

    Returns:
        A list of chunk strings; any trailing words form a final chunk.
    """
    pieces = []
    pending = []

    for token in text.split():
        pending.append(token)
        # Keep accumulating until a sentence ends AND the chunk is big enough.
        if not token.endswith('.') or len(pending) < chunk_size:
            continue
        pieces.append(' '.join(pending))
        pending = []

    # Flush whatever is left after the last closed chunk.
    if pending:
        pieces.append(' '.join(pending))

    return pieces
67
+
68
+
69
def translate_pdf(openai_key, pdf, start, stop):
    """Translate a page range of a PDF and save the result to a text file.

    Args:
        openai_key: OpenAI API key handed to ``TranslationAgent``.
        pdf: Raw bytes of the PDF, or ``None`` when no file was provided.
        start: First page to translate, passed to ``extract_text_from_pdf``.
        stop: Last page to translate, passed to ``extract_text_from_pdf``.

    Returns:
        A tuple ``(translated_text, status_message, file_path)`` where
        ``file_path`` is a temporary ``.txt`` file containing
        ``translated_text``. On failure ``status_message`` starts with
        ``"Translation Failed"`` and ``translated_text`` holds whatever was
        translated before the error.
    """
    translated_chunks = []
    error_message = "Translation Successful"

    if pdf is None:
        # Nothing to translate: report it instead of claiming success.
        error_message = "Translation Failed: no PDF file provided"
    else:
        try:
            translator = TranslationAgent(openai_key)
            text = extract_text_from_pdf(pdf, start=start, stop=stop)
            for chunk in split_text(text):
                translated_chunks.append(translator.translate_chunk(chunk))
        except Exception as e:
            # UI boundary: surface the error as a status message; chunks
            # translated before the failure are still returned.
            error_message = f"Translation Failed: {e}"

    translated_text = " ".join(translated_chunks)

    # delete=False keeps the file on disk after the handle is closed so the
    # returned path stays valid; writing through the handle itself avoids
    # leaking an open descriptor.
    with tempfile.NamedTemporaryFile(
        "w",
        delete=False,
        prefix="translatedPDF_",
        suffix=".txt",
        encoding="utf-8",
    ) as temp:
        temp.write(translated_text)

    return translated_text, error_message, temp.name
97
+
98
+
99
# Gradio UI wiring: four inputs map positionally onto
# translate_pdf(openai_key, pdf, start, stop); the three outputs map onto its
# (translated_text, status_message, file_path) return value.
# The module is imported as `gradio` (no `gr` alias), so it is referenced by
# its full name here.
# Page numbers are forwarded unchanged to extract_text_from_pdf, which
# compares them against 0-based page indices.
iface = gradio.Interface(
    fn=translate_pdf,
    inputs=[
        gradio.Textbox(lines=1, label="OpenAI API key",
                       placeholder="Enter your OpenAI API key here"),
        gradio.File(type="binary", label="PDF file"),
        gradio.Number(label="Starting Page"),
        gradio.Number(label="Final Page"),
    ],
    outputs=["text", "text", gradio.File(label="Translated Text File")],
    # The agent's system prompt translates English to Italian.
    title="Pdf Translator: English ==> Italian",
)

iface.launch()