Sourudra committed on
Commit
0c2178e
·
verified ·
1 Parent(s): a775228

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -0
app.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import docx # Importing the required module for DOCX extraction
2
+ from datasketch import MinHash, MinHashLSH # Importing MinHash and LSH from datasketch
3
+ import gradio as gr # Importing Gradio for creating the web interface
4
+
5
+ # Function to extract text from DOCX files
6
def extract_text_from_docx(docx_path):
    """Return the text of a DOCX file, one paragraph per line.

    The file at ``docx_path`` is opened with ``docx.Document`` and the
    ``text`` of every entry in its ``paragraphs`` is joined with newlines.

    Any exception raised while opening or reading the document is reported
    on stdout and an empty string is returned instead of propagating.
    """
    try:
        document = docx.Document(docx_path)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        return "\n".join(paragraph_texts)
    except Exception as error:
        # Best-effort: callers receive "" rather than an exception.
        print(f"Error extracting text from DOCX: {error!s}")
        return ""
14
+
15
+ # Function to calculate MinHash-based similarity between two texts
16
def calculate_similarity(doc1, doc2, *, k=5, num_perm=128):
    """Estimate the Jaccard similarity of two texts with MinHash.

    Each text is split into overlapping character k-grams (shingles). Every
    distinct shingle is UTF-8 encoded and fed into a ``MinHash`` signature,
    and the two signatures are then compared.

    Args:
        doc1: First text.
        doc2: Second text.
        k: Shingle length in characters. Must be at least 1.
        num_perm: ``num_perm`` value passed to each ``MinHash`` signature.

    Returns:
        The estimate returned by ``MinHash.jaccard`` for the two signatures.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    def text_to_shingles(text):
        # A non-empty text shorter than k contains no k-gram at all. Without
        # this branch both signatures would stay untouched and two *different*
        # short texts would be reported as identical, so the whole text is
        # used as a single shingle instead.
        if 0 < len(text) < k:
            return {text}
        return {text[start:start + k] for start in range(len(text) - k + 1)}

    def build_signature(shingles):
        signature = MinHash(num_perm=num_perm)
        for shingle in shingles:
            signature.update(shingle.encode("utf8"))
        return signature

    # NOTE: two empty texts still produce two signatures with no updates;
    # callers that consider that case meaningless should check for it first.
    signature1 = build_signature(text_to_shingles(doc1))
    signature2 = build_signature(text_to_shingles(doc2))
    return signature1.jaccard(signature2)
41
+
42
+ # Function to handle the file upload and similarity calculation
43
def similarity(file1, file2):
    """Compare two uploaded DOCX files and return their similarity score.

    Both uploads are validated before any file is read, so an invalid second
    upload is reported without first parsing the first one.

    Args:
        file1: First upload: either a path (``str`` / ``os.PathLike``) or an
            object exposing the file path as ``.name``.
        file2: Second upload, in the same forms as ``file1``.

    Returns:
        The value returned by ``calculate_similarity`` for the two extracted
        texts, or a human-readable message string when an upload is missing,
        is not a ``.docx`` file, or yields no text.
    """
    import os

    paths = []
    for upload in (file1, file2):
        # A missing upload (None) would otherwise fail on the attribute
        # access below.
        if upload is None:
            return "Please upload both documents."
        if isinstance(upload, (str, os.PathLike)):
            path = os.fspath(upload)
        else:
            path = upload.name
        # Compare case-insensitively so "REPORT.DOCX" is accepted as well.
        if not path.lower().endswith(".docx"):
            return "File type not supported. Please upload a DOCX file."
        paths.append(path)

    text1, text2 = (extract_text_from_docx(path) for path in paths)

    # extract_text_from_docx returns "" when it fails; scoring empty text
    # would present a meaningless number as if it were a real comparison.
    if not text1 or not text2:
        return "Could not extract any text from one or both documents."

    return calculate_similarity(text1, text2)
55
+
56
+ # Create a Gradio interface
57
with gr.Blocks() as demo:
    gr.Markdown("## Document Similarity Checker for DOCX files")
    # Restrict the upload picker to .docx, the only format `similarity` accepts.
    file1 = gr.File(label="Upload Document 1 (DOCX)", file_types=[".docx"])
    file2 = gr.File(label="Upload Document 2 (DOCX)", file_types=[".docx"])
    output = gr.Textbox(label="Similarity Score")
    submit = gr.Button("Submit")

    # Run the comparison on the two uploads and show the result in the textbox.
    submit.click(fn=similarity, inputs=[file1, file2], outputs=output)

# Launch the Gradio app only when executed as a script, so that importing
# this module (e.g. from tests) does not start a web server.
if __name__ == "__main__":
    demo.launch()