"""Gradio app for multilingual contract analysis: NLLB-200 translation,
CUAD key-clause extraction, red-flag highlighting, semantic clause search,
question generation for templates, and template filling from free text."""

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from mosestokenizer import MosesSentenceSplitter
from indicnlp.tokenize import sentence_tokenize
import docx
from docx import Document

import os
import time
import json
import re
from torch.utils.data import DataLoader, SequentialSampler

from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    squad_convert_examples_to_features
)

from transformers.data.processors.squad import SquadResult, SquadExample
from transformers.data.metrics.squad_metrics import compute_predictions_logits


# One-time setup: fetch the CUAD dataset and the fine-tuned RoBERTa-base
# extraction model released with it.
os.system('git clone https://github.com/TheAtticusProject/cuad.git')
os.system('mv cuad cuad-training')
os.system('unzip cuad-training/data.zip -d cuad-data/')
os.system('mkdir cuad-models')
os.system('curl https://zenodo.org/record/4599830/files/roberta-base.zip?download=1 --output cuad-models/roberta-base.zip')
os.system('unzip cuad-models/roberta-base.zip -d cuad-models/')



trans_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
trans_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trans_model = trans_model.to(device)



# NLLB-200 language codes for the supported languages.
lang_dict = {
    'english': 'eng_Latn',
    'assamese': 'asm_Beng',
    'awadhi': 'awa_Deva',
    'bengali': 'ben_Beng',
    'bhojpuri': 'bho_Deva',
    'gujarati': 'guj_Gujr',
    'hindi': 'hin_Deva',
    'kannada': 'kan_Knda',
    'kashmiri': 'kas_Deva',
    'maithili': 'mai_Deva',
    'malayalam': 'mal_Mlym',
    'marathi': 'mar_Deva',
    'odia': 'ory_Orya',
    'punjabi': 'pan_Guru',
    'sanskrit': 'san_Deva',
    'sindhi': 'snd_Arab',
    'tamil': 'tam_Taml',
    'telugu': 'tel_Telu',
    'urdu': 'urd_Arab'
}

def translate_sentence(article, target):
    inputs = trans_tokenizer(article.replace("\"", ""), return_tensors="pt").to(device)

    # Force the decoder to start with the target-language code so NLLB
    # generates in the requested language.
    translated_tokens = trans_model.generate(
        **inputs, forced_bos_token_id=trans_tokenizer.lang_code_to_id[lang_dict[target]], max_length=100)

    return trans_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
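
# Quick usage sketch (commented out so module import stays side-effect free):
#   translate_sentence("This agreement is effective immediately.", "hindi")
#   -> the Hindi rendering of the sentence as a plain string.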



INDIC_DICT = {"assamese" :"as",   'bengali' : 'bn', 'gujarati' :	'gu', 
    'hindi' : 'hi',
    'kannada' :	'kn',
    'malayalam' :	'ml',
    'marathi' :	'mr',
    'odia' :	'or',
    'punjabi' :	'pa',
    'tamil' :	'ta' ,
    'telugu' :	'te'}

def split_sentences(paragraph, language):
    if language in INDIC_DICT:
        return sentence_tokenize.sentence_split(paragraph, lang=INDIC_DICT[language])
    elif language == 'english':
        # Callers pass full language names, so compare against 'english'
        # (the original 'en' comparison never matched).
        with MosesSentenceSplitter('en') as splitter:
            return splitter([paragraph])
    else:
        # Fall back to a naive split for languages without a dedicated splitter.
        return paragraph.split(".")
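
# e.g. split_sentences("This is one. This is two.", "english")
#   -> ['This is one.', 'This is two.']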

def translate_paragraph(paragraph, source, target):
    if source == target:
        return paragraph
    # Short paragraphs go through in one shot; longer ones are split into
    # sentences so each stays within the generation length limit.
    if len(paragraph.split()) < 100:
        return translate_sentence(paragraph, target)
    sentences = split_sentences(paragraph, source)
    return " ".join(translate_sentence(each, target) for each in sentences)

def docx_replace(doc, data):
    """Replace paragraph text in a python-docx Document, preserving run styles.

    `data` is a list of single-entry dicts mapping original paragraph text to
    its replacement.
    """
    paragraphs = list(doc.paragraphs)
    for t in doc.tables:
        for row in t.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    paragraphs.append(paragraph)
    
    for each in data:
        key_name = list(each.keys())[0]
        val = list(each.values())[0]
        for p in paragraphs:
            if key_name in p.text:
                inline = p.runs
                # Replace strings and retain the same style. The text to be
                # replaced can be split over several runs, so search through,
                # identify which runs need to have text replaced, then
                # replace the text in those identified.
                started = False
                key_index = 0
                # found_runs is a list of (inline index, index of match, length of match)
                found_runs = list()
                found_all = False
                replace_done = False
                for i in range(len(inline)):

                    # case 1: found in single run so short circuit the replace
                    if key_name in inline[i].text and not started:
                        found_runs.append((i, inline[i].text.find(key_name), len(key_name)))
                        text = inline[i].text.replace(key_name, str(val))
                        inline[i].text = text
                        replace_done = True
                        found_all = True
                        break

                    if key_name[key_index] not in inline[i].text and not started:
                        # keep looking ...
                        continue

                    # case 2: search for partial text, find first run
                    if key_name[key_index] in inline[i].text and inline[i].text[-1] in key_name and not started:
                        # check sequence
                        start_index = inline[i].text.find(key_name[key_index])
                        check_length = len(inline[i].text)
                        for text_index in range(start_index, check_length):
                            if inline[i].text[text_index] != key_name[key_index]:
                                # no match so must be false positive
                                break
                        if key_index == 0:
                            started = True
                        chars_found = check_length - start_index
                        key_index += chars_found
                        found_runs.append((i, start_index, chars_found))
                        if key_index != len(key_name):
                            continue
                        else:
                            # found all chars in key_name
                            found_all = True
                            break

                    # case 3: search for partial text, find subsequent run
                    if key_name[key_index] in inline[i].text and started and not found_all:
                        # check sequence
                        chars_found = 0
                        check_length = len(inline[i].text)
                        for text_index in range(0, check_length):
                            if inline[i].text[text_index] == key_name[key_index]:
                                key_index += 1
                                chars_found += 1
                            else:
                                break
                        # no match so must be end
                        found_runs.append((i, 0, chars_found))
                        if key_index == len(key_name):
                            found_all = True
                            break

                if found_all and not replace_done:
                    for i, item in enumerate(found_runs):
                        index, start, length = [t for t in item]
                        if i == 0:
                            text = inline[index].text.replace(inline[index].text[start:start + length], str(val))
                            inline[index].text = text
                        else:
                            text = inline[index].text.replace(inline[index].text[start:start + length], '')
                            inline[index].text = text
                break
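
# docx_replace takes a list of single-entry dicts; hypothetical example:
#   docx_replace(doc, [{"Party A shall pay rent.": "Party B shall pay rent."}])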

# Cache of already-translated documents, keyed by the first paragraph's text.
input_output_trans = {}


def translate_fill(document_name, output_file, src, trg):
    print("translate doc")

    # Serve a cached translation if this document was translated before.
    doc = docx.Document(document_name)
    if doc.paragraphs[0].text in input_output_trans:
        lang_doc_dict = input_output_trans[doc.paragraphs[0].text]
        if trg in lang_doc_dict:
            time.sleep(2)
            return lang_doc_dict[trg]

    template_document = Document(document_name)

    # Collect {original text: translated text} pairs for every non-empty
    # paragraph, both in the body and inside tables.
    variables = []
    for paragraph in template_document.paragraphs:
        if paragraph.text.strip() != "":
            variables.append({paragraph.text: translate_paragraph(paragraph.text, src, trg)})

    for t in template_document.tables:
        for row in t.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    if paragraph.text.strip() != "":
                        variables.append({paragraph.text: translate_paragraph(paragraph.text, src, trg)})

    docx_replace(template_document, variables)
    template_document.save(output_file)
    return output_file
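
# e.g. (commented out; hypothetical filenames):
#   translate_fill("contract.docx", "contract_hindi.docx", "english", "hindi")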



def translate_txt(document_name, output_file, src, trg):
      print("translate text")
      with open(document_name) as fp:
          lines = [line.rstrip() for line in fp.readlines()]

      with open(output_file, 'w') as f:
          for line in lines:
              if line != "":
                  f.write(translate_paragraph(line, src, trg) + "\n")
              else:
                  f.write("\n")

      return output_file

# Load the CUAD fine-tuned RoBERTa-base extraction model downloaded above.
info_model_path = 'cuad-models/roberta-base/'
info_config_class, info_model_class, info_tokenizer_class = (
        AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer)
info_config = info_config_class.from_pretrained(info_model_path)
info_tokenizer = info_tokenizer_class.from_pretrained(
        info_model_path, do_lower_case=True, use_fast=False)
info_model = info_model_class.from_pretrained(info_model_path, config=info_config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
info_model.to(device)

def run_prediction(question_texts, context_text):
    # Hyperparameters for feature conversion and answer decoding.
    max_seq_length = 512
    doc_stride = 256
    n_best_size = 1
    max_query_length = 64
    max_answer_length = 512
    do_lower_case = False
    null_score_diff_threshold = 0.0

    def to_list(tensor):
        return tensor.detach().cpu().tolist()

    examples = []

    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            answers=None,
        )

        examples.append(example)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=info_tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )
    
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)
       
    all_results = []

    for batch in eval_dataloader:
        info_model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = info_model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs.to_tuple()]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    final_predictions = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=all_results,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        do_lower_case=do_lower_case,
        output_prediction_file=None,
        output_nbest_file=None,
        output_null_log_odds_file=None,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=null_score_diff_threshold,
        tokenizer=info_tokenizer
    )

    return final_predictions 
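
# Example (commented out; requires the downloaded CUAD model):
#   preds = run_prediction(["Highlight the parts related to the governing law."],
#                          "This Agreement shall be governed by the laws of India.")
#   `preds` is an OrderedDict mapping question ids ('0', '1', ...) to the
#   predicted answer spans (empty string when no answer is found).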


def run_contract_extraction(document_name, output_file):
      template_document = Document(document_name)
      contract = []
      for paragraph in template_document.paragraphs:
            if paragraph.text.strip() != '':
              contract.append(paragraph.text)

      contract = "\n".join(contract)

      # Ask every CUAD category question against the full contract text.
      with open('./cuad-data/CUADv1.json') as json_file:
        data = json.load(json_file)

      questions = [qa['question'] for qa in data['data'][0]['paragraphs'][0]['qas']]

      predictions = run_prediction(questions, contract)

      with open(output_file, 'w') as f:
          count = 1
          for p in predictions:
              if predictions[p] != '':
                f.write("Question " + str(count) + ": " + questions[int(p)] +
                        "\nPredicted Answer: " + predictions[p] + "\n\n")
                count += 1

      return output_file

# Cache of already-analyzed contracts, keyed by the first paragraph's text.
input_output_key = {}

def run_key_clause(document_name, output_name, source_language):
  doc = docx.Document(document_name)
  if doc.paragraphs[0].text in input_output_key:
      time.sleep(2)
      return input_output_key[doc.paragraphs[0].text]

  # Non-English contracts are translated to English, analyzed there, and the
  # resulting report is translated back.
  if source_language != 'english':
     translation_output = translate_fill(document_name, "info_translation.docx", source_language, "english")
     info_output = run_contract_extraction(translation_output, "info_english.txt")
     final_info = translate_txt(info_output, output_name, "english", source_language)
  else:
     final_info = run_contract_extraction(document_name, output_name)

  return final_info
  
  
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from docx import Document

# T5 model fine-tuned for answer-aware question generation.
# (AutoModelForSeq2SeqLM replaces the deprecated AutoModelWithLMHead.)
qg_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
qg_model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qg_model.to(device)

def get_question(answer, context, max_length=64):
  input_text = "answer: %s  context: %s </s>" % (answer, context)
  features = qg_tokenizer([input_text], return_tensors='pt').to(device)

  output = qg_model.generate(input_ids=features['input_ids'], 
               attention_mask=features['attention_mask'],
               max_length=max_length)

  return qg_tokenizer.decode(output[0])
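
# Example (commented out): asking for a question whose answer is "Delhi" in
# the given context. The raw decode keeps the '<pad> question:' prefix, which
# run_fill_questions strips.
#   get_question("Delhi", "The property is located in Delhi.")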


def run_fill_questions(document_name, output_file, questions_file, delimiter):
      print("QGenerations")
      prev_para = ''
      count = 0
      variables = []
      questions = []

      doc = Document(document_name)

      for paragraph in doc.paragraphs:
              if(paragraph.text.strip()==''):
                    continue
              if(paragraph.text.count(delimiter)>0):
                      var_count = paragraph.text.count(delimiter)
                      format_str = paragraph.text.replace(delimiter, '{}')
                      new_string = format_str.format(*('id'+str(i) for i in range(count,count+var_count)))
                      
                      answers = ['id'+str(i) for i in range(count,count+var_count)]
                      
                      # Short lines lack context for question generation,
                      # so prepend the previous paragraph.
                      if len(new_string.split()) < 10:
                          context = prev_para + " " + new_string
                      else:
                          context = new_string

                          
                      for answer in answers:
                                question_string = get_question(answer, context).replace('<pad> question:','').replace('</s>','').strip()
                                question = "{{"+question_string+"}}"
                                questions.append(question_string)
                                new_string = new_string.replace(answer, question)
                      
                      count += var_count
                      variables.append({paragraph.text : new_string })

              prev_para = paragraph.text
      
      with open(questions_file, 'w') as f:
          count = 1
          for p in questions:
                f.write("Question "+str(count)+": "+ p +"\n")
                count += 1

      
      docx_replace(doc, variables)
      doc.save(output_file)
      return output_file, questions_file
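
# Sketch of the transformation (hypothetical template text): with delimiter
# "____", a paragraph "The rent is ____ per month." becomes
# "The rent is {{<generated question>}} per month.", and the generated
# question is also written to `questions_file`.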


def extract_questions(document_name, output_file):
  questions = []
  doc = Document(document_name)

  for paragraph in doc.paragraphs:
        if paragraph.text.strip() == '':
            continue
        q = re.findall(r'\{\{(.*?)\}\}', paragraph.text.strip())
        questions.extend(q)

  with open(output_file, 'w') as f:
          count = 1
          for p in questions:
                f.write("Question " + str(count) + ": " + p + "\n")
                count += 1

  return output_file

# Cache of already-processed templates, keyed by the first paragraph's text.
input_output_qg = {}


def run_generate_questions(document_name, output_file, questions_file, delimiter, source_language):
  doc = docx.Document(document_name)
  if doc.paragraphs[0].text in input_output_qg:
      qg_output = input_output_qg[doc.paragraphs[0].text]
      q_output = extract_questions(qg_output, questions_file)
      time.sleep(2)
      return qg_output, q_output
  if source_language != 'english':
      # Translate to English, generate questions there, then translate both
      # outputs back to the source language.
      translation_output = translate_fill(document_name, "qg_translation.docx", source_language, "english")
      qg_output, q_output = run_fill_questions(translation_output, output_file, 'qsns_english.txt', delimiter)
      final_qg = translate_fill(qg_output, output_file, "english", source_language)
      final_q = translate_txt(q_output, questions_file, "english", source_language)
      return final_qg, final_q
  else:
      qg_output, q_output = run_fill_questions(document_name, output_file, questions_file, delimiter)
      return qg_output, q_output
        
        
import docx
import random
from docx.shared import RGBColor
import time 
import re

# Cache of already-highlighted documents, keyed by the first paragraph's text.
input_output_red = {}

def run_redflags(filename, output_file):
    print("Red flags")
    doc = docx.Document(filename)
    if doc.paragraphs[0].text in input_output_red:
        return input_output_red[doc.paragraphs[0].text]
    else:
        # Demo heuristic: colour roughly 20% of the longer paragraphs red.
        for para in doc.paragraphs:
            inline = para.runs
            colour = (len(para.text.split()) > 10) and random.random() > 0.8
            if colour:
              for i in range(len(inline)):
                  inline[i].font.color.rgb = RGBColor(255, 0, 0)

        time.sleep(2)
        doc.save(output_file)
        return output_file

     
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from docx import Document
from collections import Counter

# T5 model fine-tuned for abstractive question answering over a context.
# (AutoModelForSeq2SeqLM replaces the deprecated AutoModelWithLMHead.)
rc_tokenizer = AutoTokenizer.from_pretrained("tuner007/t5_abs_qa")
rc_model = AutoModelForSeq2SeqLM.from_pretrained("tuner007/t5_abs_qa")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rc_model = rc_model.to(device)

def get_answer(question, context):
      input_text = "context: %s <question for context: %s </s>" % (context,question)
      features = rc_tokenizer([input_text], return_tensors='pt')
      out = rc_model.generate(input_ids=features['input_ids'].to(device), attention_mask=features['attention_mask'].to(device))
      return rc_tokenizer.decode(out[0])
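
# Example (commented out; hypothetical strings):
#   get_answer("Who is the lessor?", "The lessor is Mr. Sharma.")
#   -> the decoded answer text, or 'No answer available in context' when the
#      model cannot find one.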

def extract_questions_for_info(document_name):
  # Collect the {{question}} placeholders from every non-empty paragraph.
  questions = []
  doc = Document(document_name)

  for paragraph in doc.paragraphs:
        if paragraph.text.strip() == '':
            continue
        q = re.findall(r'\{\{(.*?)\}\}', paragraph.text.strip())
        questions.extend(q)
  return questions


def extract_info(questions, context):
        variables = []
        unanswered = []
        max_length = 512  # maximum length of a feature (question and context)
        doc_stride = 256

        for question in questions:
                # Long contexts are split into overlapping windows; each
                # window is answered separately and the most common answer
                # wins.
                tokenized_example = rc_tokenizer(
                    str(question),
                    str(context.replace('\'', '').replace('"', "")),
                    max_length=max_length,
                    truncation="only_second",
                    return_overflowing_tokens=True,
                    stride=doc_stride)
                answers = []
                for x in tokenized_example["input_ids"]:
                      decoded = rc_tokenizer.decode(x)
                      q, c = decoded.split("</s>")[0], decoded.split("</s>")[1]
                      answers.append(get_answer(q, c).replace('<pad>', '').replace('</s>', '').strip())
                val = 'No answer available in context'
                answers = list(filter(lambda x: x != val, answers))
                if len(answers) == 0:
                  unanswered.append(question)
                else:
                  fre_list = Counter(answers)
                  answer = fre_list.most_common(1)[0][0]
                  variables.append({"{{" + question + "}}": answer})
        return variables, unanswered
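
# Example (commented out; hypothetical strings, output depends on the model):
#   extract_info(["What is the notice period?"], "The notice period is 30 days.")
#   -> roughly ([{'{{What is the notice period?}}': '30 days'}], [])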

# Cache of already-filled documents, keyed by the first paragraph's text.
input_output_exin = {}

def run_extract_info(document_name, context, output_file, source_language):
  print("Extract")
  doc = docx.Document(document_name)

  if doc.paragraphs[0].text in input_output_exin:
      exin_output = input_output_exin[doc.paragraphs[0].text]
      exin_unanswered = extract_questions_for_info(exin_output)
      time.sleep(2)
      return exin_output, exin_unanswered

  if source_language != 'english':
      # Work in English: extract the questions from the translated template,
      # translate the context, answer, then translate the results back.
      translation_output = translate_fill(document_name, "exin_translation.docx", source_language, "english")
      questions = extract_questions_for_info(translation_output)
      context = translate_paragraph(context, source_language, "english")

      variables, unanswered = extract_info(questions, context)
      template_document = Document(document_name)
      docx_replace(template_document, variables)
      template_document.save("exin_modified.docx")

      final_exin = translate_fill("exin_modified.docx", output_file, "english", source_language)
      unans_exin = [translate_paragraph(each, "english", source_language) for each in unanswered]
      return final_exin, unans_exin

  questions = extract_questions_for_info(document_name)
  variables, unanswered = extract_info(questions, context)
  template_document = Document(document_name)
  docx_replace(template_document, variables)
  template_document.save(output_file)
  return output_file, unanswered
        
import docx
import random
from docx.shared import RGBColor
import time 
import re
from docx import Document

from docx.enum.text import WD_COLOR_INDEX

from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# IndicBERT encoder used to embed sentences for semantic clause search.
similar_tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')
similar_model = AutoModel.from_pretrained('ai4bharat/indic-bert')
similar_model.eval()

def obtain_rep(documents):
      # Encode each sentence with IndicBERT and keep its pooled
      # representation; returns a (num_sentences, hidden_size) tensor.
      mean_pooled = []
      with torch.no_grad():
          for sentence in documents:
              tokens = similar_tokenizer.encode_plus(sentence, max_length=128,
                                                truncation=True, padding='max_length',
                                                return_tensors='pt')
              outputs = similar_model(**tokens)
              mean_pooled.append(outputs.pooler_output)

      return torch.stack(mean_pooled).squeeze(1)

def similarity(documents, clauses):
      # Cosine similarity of every clause embedding against every sentence
      # embedding; keep each sentence's best score across clauses.
      clauses = clauses.detach().numpy()
      documents = documents.detach().numpy()
      sim = cosine_similarity(clauses, documents)
      max_sim = np.max(sim, axis=0)
      return max_sim
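
# Shape note: with C clauses and S sentences, `sim` is (C, S) and the max
# over axis 0 leaves one best-match score per sentence.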

def fill_yellow(filename, output_file, highlighted_paras):
     doc = docx.Document(filename)
     for each in highlighted_paras:
          for para in doc.paragraphs:
                  if each in para.text:
                      for run in para.runs:
                          run.font.highlight_color = WD_COLOR_INDEX.YELLOW
                      break
     doc.save(output_file)
     return output_file
      

def get_similar_clauses(filename, output_file, clauses, source_language):
      paras = []
      template_document = Document(filename)
      contract = []
      for paragraph in template_document.paragraphs:
            if paragraph.text.strip() != '':
              contract.append(paragraph.text)

      # Split paragraphs into sentences and drop fragments of 5 words or fewer.
      sentence_batch = []
      for paragraph in contract:
        sentence_batch.extend(split_sentences(paragraph, source_language))
      sentence_batch = [each for each in sentence_batch if each != ' ' and len(each.split()) > 5]

      # Rank sentences by their best similarity to any query clause and keep
      # the top 10% (at least 3).
      doc_rep = obtain_rep(sentence_batch)
      clause_rep = obtain_rep(clauses)
      k = similarity(doc_rep, clause_rep)
      pick_top = max(int(0.1 * len(sentence_batch)), 3)
      ind = k.argsort()[-pick_top:][::-1]
      for each_idx in ind:
          paras.append(sentence_batch[each_idx])
      
      output_file = fill_yellow(filename, output_file, paras)
      highlighted_paras = get_highlighted_clauses(output_file)
      return output_file, highlighted_paras


# Cache of already-searched documents and their clause sets.
input_output_similar = {}

def get_highlighted_clauses(filename):
   doc = docx.Document(filename)
   para_highlighted = []
   for para in doc.paragraphs:
            inline = para.runs
            colour = False
            for i in range(len(inline)):
                if inline[i].font.highlight_color == WD_COLOR_INDEX.YELLOW :
                  colour = True
                  break
            if colour:
              para_highlighted.append(para.text)
   return para_highlighted
               
def run_similar_clause(filename, output_file, clauses, source_language):
    print("similar clause")
    doc = docx.Document(filename)
    for doc_input in list(input_output_similar.keys()):
        if doc.paragraphs[0].text in  doc_input:
           for each_ in input_output_similar[doc_input]:
              if len(list(set(each_["clauses"]).intersection(set(clauses))))>0 :
                 output_file =  each_["file"]
           time.sleep(3)
           highlighted_paras = get_highlighted_clauses(output_file)
           return output_file, highlighted_paras
    output_file, highlighted_paras = get_similar_clauses(filename, output_file,clauses, source_language)
    return output_file, highlighted_paras
    
import gradio as gr

analysis_services = ['Translate Contract', 'Identify Key Clauses', 'Red Flag Identification', 'Similar Semantic Clause Search', 'Generate Questions for Contract Template', 'Fill Contract Template by Extracting Information']
analysis_label = 'Select Contract Analysis Service'
analysis_choices = analysis_services
analysis_choice = ''
lang_choice = 'english'
translation_label = 'Upload contract for Translation'
translation_src_label = 'Select language of uploaded contract'
translation_tgt_label = 'Select language to translate'
keyclause_label = 'Upload contract for Key Clause Extraction'
redflag_label = 'Upload contract for Red Flag Identification'
similar_label = 'Upload contract for Semantic Similar Clauses'
similar_clause_label = 'Enter clauses to be identified (enter one clause per line)'
generate_questions_label = 'Upload template contract for Question Generation'
rc_file_label = 'Upload template contract with questions to fill'
rc_context_label = 'Enter the text to extract answer from'
delimiter_label = "Input placeholder (pattern or symbol used as blank in template)"
button_label = "Upload and Analyze"


translation_output_label = 'Download your translated contract'
keyclause_output_label = 'Download your key clauses from the contract'
redflag_output_label = 'Download your contract with red flags highlighted'
similar_file_label = 'Download your contract with similar clauses highlighted in yellow'
similar_text_label = 'A quick view of similar clauses'
qg_output_label = 'Download your template contract along with questions'
q_output_label = 'Download only questions to fill the template contract'
rc_output_label = 'Download your template contract along with filled answers'
rc_text_label = 'Unanswered Questions'

def change_analysis(choice):
    global lang_choice, analysis_choices
    lang_choice = choice
    # Re-render the service names in the selected language and hide all
    # inputs/outputs until a service is picked.
    analysis_choices = [translate_paragraph(paragraph, "english", choice) for paragraph in analysis_services]
    return [gr.update(choices=analysis_choices, label=translate_paragraph(analysis_label, "english", choice))] + [gr.update(visible=False)] * 8

def change_inputs(choice):
    # Outputs, in order: input_file, input_text, output_file, output_file2,
    # output_text, translation_target, translation_source, delimiter, button.
    global analysis_choice
    analysis_choice = choice
    if analysis_choice == analysis_choices[0]:
          return [gr.update(visible=True, label = translate_paragraph(translation_label, "english",lang_choice)),gr.update(visible=False), gr.update(visible=True, label=''),gr.update(visible=False),gr.update(visible=False),gr.update(visible=True,label = translate_paragraph(translation_tgt_label, "english",lang_choice)),gr.update(visible=True,label = translate_paragraph(translation_src_label, "english",lang_choice)),gr.update(visible=False), gr.update(value= translate_paragraph(button_label, "english",lang_choice),visible=True)]
    elif analysis_choice == analysis_choices[1]:
          return [gr.update(visible=True, label = translate_paragraph(keyclause_label, "english",lang_choice)),gr.update(visible=False), gr.update(visible=True,label=''),gr.update(visible=False),gr.update(visible=False),gr.update(visible=False),gr.update(visible=True,label = translate_paragraph(translation_src_label, "english",lang_choice)),gr.update(visible=False),gr.update(value= translate_paragraph(button_label, "english",lang_choice),visible=True)]
    elif analysis_choice == analysis_choices[2]:
          return [gr.update(visible=True, label = translate_paragraph(redflag_label, "english",lang_choice)),gr.update(visible=False), gr.update(visible=True,label=''),gr.update(visible=False),gr.update(visible=False),gr.update(visible=False),gr.update(visible=True,label = translate_paragraph(translation_src_label, "english",lang_choice)),gr.update(visible=False),gr.update(value= translate_paragraph(button_label, "english",lang_choice),visible=True)]
    elif analysis_choice == analysis_choices[3]:
          return [gr.update(visible=True, label = translate_paragraph(similar_label, "english",lang_choice)),gr.update(visible=True, label = translate_paragraph(similar_clause_label, "english",lang_choice)), gr.update(visible=True,label=''),gr.update(visible=True,label=''),gr.update(visible=True,label=''),gr.update(visible=False),gr.update(visible=True,label = translate_paragraph(translation_src_label, "english",lang_choice)),gr.update(visible=False),gr.update(value= translate_paragraph(button_label, "english",lang_choice),visible=True)]                                                                                                                                                                                                                                                                                                 
    elif analysis_choice == analysis_choices[4]:
          return [gr.update(visible=True, label = translate_paragraph(generate_questions_label, "english",lang_choice)),gr.update(visible=False), gr.update(visible=True,label=''),gr.update(visible=True,label=''),gr.update(visible=False),gr.update(visible=False),gr.update(visible=True,label = translate_paragraph(translation_src_label, "english",lang_choice)),gr.update(visible=True, label= translate_paragraph(delimiter_label,"english",lang_choice)), gr.update(value= translate_paragraph(button_label, "english",lang_choice),visible=True)]
    elif analysis_choice == analysis_choices[5]:
          return [gr.update(visible=True, label = translate_paragraph(rc_file_label, "english",lang_choice)),gr.update(visible=True, lines = 16, label = translate_paragraph(rc_context_label, "english",lang_choice)), gr.update(visible=True,label=''),gr.update(visible=True,label=''),gr.update(visible=True,label=''),gr.update(visible=False),gr.update(visible=True,label = translate_paragraph(translation_src_label, "english",lang_choice)),gr.update(visible=False),gr.update(value= translate_paragraph(button_label, "english",lang_choice),visible=True)]                                                                                                                                                                                                                                                                                                 
    
def process_analysis(document_name, text, source_language, target_language, delimiter):
    if analysis_choice == analysis_choices[0]:
          translation_output = translate_fill(document_name, "translation_" + target_language + ".docx", source_language , target_language)
          return [gr.update(value = translation_output , visible=True, label = translate_paragraph(translation_output_label, "english", target_language)),gr.update(visible=False),gr.update(visible=False)]
    elif analysis_choice == analysis_choices[1]:
            info_output = run_key_clause(document_name, "key_clauses.txt",source_language)
            return [gr.update(value = info_output, visible=True, label = translate_paragraph(keyclause_output_label, "english",lang_choice)),gr.update(visible=False), gr.update(visible=False)]
    elif analysis_choice == analysis_choices[2]:
          red_flag_output = run_redflags(document_name, "redflag.docx")
          return [gr.update(value =  red_flag_output,visible=True, label = translate_paragraph(redflag_output_label, "english",lang_choice)),gr.update(visible=False), gr.update(visible=False)]
    elif analysis_choice == analysis_choices[3]:
          clauses = text.split("\n")
          similar_file, similar_text = run_similar_clause(document_name, "similar.docx", clauses, source_language)
          similar_text = "\n\n\n".join(similar_text)
          return [gr.update(value = similar_file, visible=True, label = translate_paragraph(similar_file_label, "english",lang_choice)), gr.update(visible=False),gr.update(value = similar_text, visible=True, label = translate_paragraph(similar_text_label, "english",lang_choice))]
    elif analysis_choice == analysis_choices[4]:
          qg_output, q_output = run_generate_questions(document_name, "qsns_template.docx", "qsns_only.txt", delimiter, source_language)
          return [gr.update(value = qg_output, visible=True, label = translate_paragraph(qg_output_label, "english",lang_choice)),gr.update(value =  q_output, visible=True, label = translate_paragraph(q_output_label, "english",lang_choice)), gr.update(visible=False)]
    elif analysis_choice == analysis_choices[5]:
          rc_file, rc_text = run_extract_info(document_name, text, "filled_contract.docx", source_language)
          rc_text = "\n\n".join(rc_text)
          return [gr.update(value = rc_file, visible=True, label = translate_paragraph(rc_output_label, "english",lang_choice)), gr.update(visible=False),gr.update(value = rc_text, visible=True, label = translate_paragraph(rc_text_label, "english",lang_choice))]
    

with gr.Blocks() as demo:
    lang_radio = gr.Radio(list(lang_dict.keys()), value='english', label="Select your language")
    analysis_radio = gr.Radio(analysis_services, label=analysis_label)

    with gr.Row():
        input_file = gr.File(interactive=True, visible=False)
        with gr.Column():
            translation_source = gr.Dropdown(choices=list(lang_dict.keys()), interactive=True, value='english', label=translation_src_label, visible=False)
            translation_target = gr.Dropdown(choices=list(lang_dict.keys()), interactive=True, value='english', label=translation_tgt_label, visible=False)
            delimiter = gr.Textbox(label=delimiter_label, lines=1, interactive=True, visible=False)

    input_text = gr.Textbox(lines=4, interactive=True, visible=False)

    button = gr.Button(value=button_label, visible=False)
    output_file = gr.File(interactive=False, visible=False)
    output_file2 = gr.File(interactive=False, visible=False)
    output_text = gr.Textbox(interactive=False, visible=False)

    lang_radio.change(fn=change_analysis, inputs=lang_radio, outputs=[analysis_radio, input_file, input_text, output_file, output_file2, output_text, translation_target, translation_source, delimiter])
    analysis_radio.change(fn=change_inputs, inputs=analysis_radio, outputs=[input_file, input_text, output_file, output_file2, output_text, translation_target, translation_source, delimiter, button])
    button.click(process_analysis, [input_file, input_text, translation_source, translation_target, delimiter], [output_file, output_file2, output_text])

# Launch outside the Blocks context, once the UI is fully defined.
demo.launch(debug=True)