Sourudra committed on
Commit
96b2151
·
verified ·
1 Parent(s): ed4fadc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -67
app.py CHANGED
@@ -1,67 +1,80 @@
1
- import docx # Importing the required module for DOCX extraction
2
- from datasketch import MinHash, MinHashLSH # Importing MinHash and LSH from datasketch
3
- import gradio as gr # Importing Gradio for creating the web interface
4
-
5
- # Function to extract text from DOCX files
6
- def extract_text_from_docx(docx_path):
7
- try:
8
- doc = docx.Document(docx_path) # Open the DOCX file
9
- text = "\n".join([para.text for para in doc.paragraphs]) # Extract the text from paragraphs
10
- return text
11
- except Exception as e:
12
- print(f"Error extracting text from DOCX: {str(e)}")
13
- return ""
14
-
15
- # Function to calculate MinHash-based similarity between two texts
16
- def calculate_similarity(doc1, doc2):
17
- def text_to_shingles(text, k=5):
18
- # Split the text into k-grams (shingles) of length k
19
- shingles = set()
20
- for i in range(len(text) - k + 1):
21
- shingles.add(text[i:i + k])
22
- return shingles
23
-
24
- # Generate shingles for both documents
25
- shingles1 = text_to_shingles(doc1)
26
- shingles2 = text_to_shingles(doc2)
27
-
28
- # Compute MinHash signatures
29
- minhash1 = MinHash(num_perm=128)
30
- minhash2 = MinHash(num_perm=128)
31
-
32
- for shingle in shingles1:
33
- minhash1.update(shingle.encode('utf8'))
34
-
35
- for shingle in shingles2:
36
- minhash2.update(shingle.encode('utf8'))
37
-
38
- # Compute Jaccard similarity using MinHash
39
- similarity_score = minhash1.jaccard(minhash2)
40
- return similarity_score
41
-
42
- # Function to handle the file upload and similarity calculation
43
- def similarity(file1, file2):
44
- if file1.name.endswith('.docx'):
45
- text1 = extract_text_from_docx(file1.name)
46
- else:
47
- return "File type not supported. Please upload a DOCX file."
48
-
49
- if file2.name.endswith('.docx'):
50
- text2 = extract_text_from_docx(file2.name)
51
- else:
52
- return "File type not supported. Please upload a DOCX file."
53
-
54
- return calculate_similarity(text1, text2)
55
-
56
- # Create a Gradio interface
57
- with gr.Blocks() as demo:
58
- gr.Markdown("## Document Similarity Checker for DOCX files")
59
- file1 = gr.File(label="Upload Document 1 (DOCX)")
60
- file2 = gr.File(label="Upload Document 2 (DOCX)")
61
- output = gr.Textbox(label="Similarity Score")
62
- submit = gr.Button("Submit")
63
-
64
- submit.click(fn=similarity, inputs=[file1, file2], outputs=output)
65
-
66
- # Launching the Streamlit app
67
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import docx # Importing the required module for DOCX extraction
2
+ from datasketch import MinHash, MinHashLSH # Importing MinHash and LSH from datasketch
3
+ import gradio as gr # Importing Gradio for creating the web interface
4
+
5
+ # Function to extract text from DOCX files
6
def extract_text_from_docx(docx_path):
    """Return the paragraph text of a DOCX file as a single string.

    Only ``doc.paragraphs`` is read; the paragraphs are joined with
    newlines in document order.

    Args:
        docx_path: Path (or file-like object) handed to ``docx.Document``.

    Returns:
        The joined paragraph text, or ``""`` when the document cannot be
        opened or parsed. Extraction is deliberately best-effort: the error
        is printed instead of raised so the web UI keeps running.
    """
    try:
        doc = docx.Document(docx_path)  # Open the DOCX file
        return "\n".join(para.text for para in doc.paragraphs)
    except Exception as e:
        # Top-level boundary for the UI: report the problem and fall back to
        # an empty string, which callers treat as "no text available".
        print(f"Error extracting text from DOCX: {e}")
        return ""
14
+
15
+ # Function to calculate MinHash-based similarity between two texts
16
def calculate_similarity(doc1, doc2, *, k=5, num_perm=128):
    """Estimate the Jaccard similarity of two texts with MinHash.

    Each text is split into overlapping character shingles of length ``k``;
    the shingle sets are hashed into MinHash signatures whose agreement
    approximates the Jaccard similarity of the sets.

    Args:
        doc1: First text.
        doc2: Second text.
        k: Shingle length in characters (default 5).
        num_perm: Number of MinHash permutations (default 128).

    Returns:
        A float in ``[0.0, 1.0]``. Two empty texts score ``1.0`` and an
        empty text against a non-empty one scores ``0.0``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    def text_to_shingles(text):
        # Split the text into k-grams (shingles) of length k.
        if not text:
            return set()
        if len(text) < k:
            # A text shorter than k has no k-gram at all. Treat the whole
            # text as one shingle so that short, different texts are not
            # both reduced to the empty set (and then reported as identical).
            return {text}
        return {text[i:i + k] for i in range(len(text) - k + 1)}

    def signature(shingles):
        # Build the MinHash signature of one shingle set.
        minhash = MinHash(num_perm=num_perm)
        for shingle in shingles:
            minhash.update(shingle.encode('utf8'))
        return minhash

    # Generate shingles for both documents
    shingles1 = text_to_shingles(doc1)
    shingles2 = text_to_shingles(doc2)

    # An un-updated MinHash only matches another un-updated MinHash, so the
    # empty cases are answered directly instead of relying on that quirk.
    if not shingles1 or not shingles2:
        return 1.0 if shingles1 == shingles2 else 0.0

    # Compute Jaccard similarity using MinHash
    return signature(shingles1).jaccard(signature(shingles2))
41
+
42
+ # Function to handle the similarity calculation
43
def similarity(doc1, doc2, file1=None, file2=None):
    """Compare two documents supplied as uploaded DOCX files or pasted text.

    For each document an uploaded ``.docx`` file takes precedence; the
    pasted text is used when no DOCX was uploaded, the upload is not a
    ``.docx`` file, or no text could be extracted from it.

    Args:
        doc1: Pasted text for document 1 (may be empty or ``None``).
        doc2: Pasted text for document 2 (may be empty or ``None``).
        file1: Optional upload for document 1, either a path string or an
            object exposing the path as ``.name``.
        file2: Optional upload for document 2, same forms as ``file1``.

    Returns:
        The similarity score from ``calculate_similarity``, or a message
        string asking for input when a document has no usable text.
    """
    def resolve_text(pasted, upload):
        # Pick the text for one document: DOCX content first, pasted second.
        if upload is not None:
            # Accept both a plain path and an upload object carrying `.name`.
            path = getattr(upload, "name", upload)
            # Case-insensitive so that "REPORT.DOCX" is accepted as well.
            if str(path).lower().endswith('.docx'):
                extracted = extract_text_from_docx(path)
                if extracted:
                    return extracted
        return pasted or ""

    text1 = resolve_text(doc1, file1)
    if not text1:
        return "Please provide either a DOCX file or paste the text for Document 1."

    text2 = resolve_text(doc2, file2)
    if not text2:
        return "Please provide either a DOCX file or paste the text for Document 2."

    return calculate_similarity(text1, text2)
63
+
64
+ # Create a Gradio interface
65
with gr.Blocks() as demo:
    gr.Markdown("## Document Similarity Checker")
    with gr.Row():
        # One column per document: a DOCX upload plus a paste-in fallback.
        with gr.Column():
            file1 = gr.File(label="Upload Document 1 (DOCX)", file_types=[".docx"])
            doc1 = gr.Textbox(label="Or Paste Text for Document 1", lines=10)
        with gr.Column():
            file2 = gr.File(label="Upload Document 2 (DOCX)", file_types=[".docx"])
            doc2 = gr.Textbox(label="Or Paste Text for Document 2", lines=10)
    output = gr.Textbox(label="Similarity Score")
    submit = gr.Button("Submit")

    # The input order must match the parameter order of `similarity`:
    # pasted texts first, then the uploaded files.
    submit.click(fn=similarity, inputs=[doc1, doc2, file1, file2], outputs=output)

# Launch the Gradio app only when this file is executed as a script, so that
# importing the module (e.g. for testing) does not start a server.
if __name__ == "__main__":
    demo.launch()