diff --git a/__pycache__/config.cpython-310.pyc b/__pycache__/config.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8276d0e04cb144608ba3d64942b7225bb82b6d92 Binary files /dev/null and b/__pycache__/config.cpython-310.pyc differ diff --git a/app.py b/app.py index ced732ee8b6e822b613fc465a4b2eb255df41544..b6867f101d3aff2d4a630409bfebd4f541570938 100644 --- a/app.py +++ b/app.py @@ -1,91 +1,33 @@ -import gradio as gr +import pandas as pd +import os +from langchain.llms import OpenAI +import chromadb +from config import * +from src.control.control import Controller +from src.tools.retriever import Retriever +from src.tools.llm import LlmAgent +from src.model.doc import Doc +import src.view.view as view -import src.control.control as ctrl +os.environ["OPENAI_API_KEY"] = OpenAI_KEY +os.environ["TOKENIZERS_PARALLELISM"] = "true" +doc_content = Doc(content_en_path) +doc_plan = Doc(plan_path) +doc_content_fr = Doc(content_fr_path) -""" -================================== -A. Component part -================================== -""" +client_db = chromadb.Client() +retriever = Retriever(client_db, doc_plan, doc_content, doc_content_fr, collection_name) -with gr.Blocks() as hrqa: +llm_model = OpenAI(temperature=0) +llm = LlmAgent(llm_model) - with gr.Row(): +specials['remote_rate_df'] = pd.read_csv(specials['remote_rate_path']) +specials['accommodation_meal_df'] = pd.read_csv(specials['accommodation_meal_path']) +controller = Controller(retriever=retriever, llm=llm, content_language=content_language, plan_language=plan_language, + specials=specials) - with gr.Column(): - pass +qna = view.run(ctrl=controller, examples=examples) - with gr.Column(scale=10): - """ - 1. input docs components - """ - - gr.Markdown("# Questions sur le vivre ensemble en entreprise") - - input_text_comp = gr.Textbox( - label="", - lines=1, - max_lines=3, - interactive=True, - placeholder="Posez votre question ici", - ) - input_example_comp = gr.Radio( - label="Examples de questions", - choices=["Remboursement de frais de voiture", "Recommandations de transport"], - ) - output_text_comp = gr.Textbox( - label="La réponse automatique", - lines=2, - max_lines=10, - interactive=False, - visible=False, - ) - sources_comp = gr.CheckboxGroup( - label="Documents sources", - visible=False, - interactive=False, - ) - - with gr.Column(): - pass - - - def input_text_fn1(): - update_ = { - output_text_comp: gr.update(visible=True), - } - return update_ - - def input_text_fn2(input_text_): - answer, sources = ctrl.get_response(query=input_text_) - source_labels = [s['distance']+' '+s['paragraph']+' '+s['title']+' from '+s['doc'] for s in sources] - update_ = { - output_text_comp: gr.update(value=answer), - sources_comp: gr.update(visible=True, choices=source_labels, value=source_labels) - } - return update_ - - - def input_example_fn(input_example_): - examples = { - "Remboursement de frais de voiture": "Comment sont remboursés mes frais kilométriques sur mes trajets " - "professionnels?", - "Recommandations de transport": "Quelles sont les recommandations de l'entreprise? Vaut-il mieux voyager en " - "train ou en avion?" - } - update_ = { - input_text_comp: gr.update(value=examples[input_example_]), - output_text_comp: gr.update(visible=True), - } - return update_ - - input_text_comp\ - .submit(input_text_fn1, inputs=[], outputs=[output_text_comp])\ - .then(input_text_fn2, inputs=[input_text_comp], outputs=[output_text_comp, sources_comp]) - input_example_comp\ - .change(input_example_fn, inputs=[input_example_comp], outputs=[input_text_comp, output_text_comp])\ - .then(input_text_fn2, inputs=[input_text_comp], outputs=[output_text_comp, sources_comp]) - -hrqa.queue().launch() \ No newline at end of file +qna.queue().launch() diff --git a/config.py b/config.py new file mode 100644 index 0000000000000000000000000000000000000000..3035b6f88d8eff63488ae309a9a88a8533164f51 --- /dev/null +++ b/config.py @@ -0,0 +1,34 @@ +plan_language = 'en' +content_language = 'en' +plan_path = 'data/business_trips_plan_until_end_en.docx' +content_en_path = 'data/business_trips_content_until_end_en.docx' +content_fr_path = 'data/business_trips_content_until_end_fr.docx' + +collection_name = "until_end" +OpenAI_KEY = "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" +examples = { + "Remboursement de frais de voiture": "Comment sont remboursés mes frais kilométriques sur mes trajets " + "professionnels?", + "Recommandations de transport": "Quelles sont les recommandations de l'entreprise? " + "Vaut-il mieux voyager en train ou en avion?", + "Indemnités pour des séjours longs à l'étranger": "Y a-t-il des indemnités pour des séjours longs à l'étranger?", + "Indemnités pour des séjours longs en Bolivie": "Y a-t-il des indemnités pour des séjours longs en Bolivie?", + "Indemnités pour les repas aux Pays-Bas": "Quelles sont les indemnités pour les repas au Pays-Bas?" +} +countries_extensions = { + 'Royaume-Uni': ['UK', 'U.K.','RU', 'R.U.', 'Angleterre'], + 'Etats-Unis': ['Etats-unis', 'Etats Unis', 'Etats unis', 'ETATS-UNIS', 'USA'], + 'E.A.U': ["EAU", "Emirats", "Emirats Arabes Unis", "Emirates", "UAE", "United Arab Emirates"], + 'Pays-Bas': ['Les Pays-Bas'] +} +specials = {'remote_rate_path': 'data/remote_rates.csv', + 'remote_rate_known': "the scale rate of remoteness for the ", + + 'remote_rate_unknown': "the scale rate of remoteness for the country mentionned is unknown. Allowances " + "apply though", + 'accommodation_meal_path': 'data/accommodation_meal_rates.csv', + 'accommodation_meal_known': 'the rates for accommodation and meals are the following: ', + 'accommodation_meal_unknown': 'the rates for accommodation and meals are not defined for the country ' + 'mentionned ', + 'countries_extensions': countries_extensions, + } diff --git a/data/.DS_Store b/data/.DS_Store index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..284b02617e8e0e4b30c2ce0488815675e6dbd0e5 100644 Binary files a/data/.DS_Store and b/data/.DS_Store differ diff --git a/data/AccomodationAndMealsForfaits_en.csv b/data/AccomodationAndMealsForfaits_en.csv deleted file mode 100644 index e9499ec401c3eb01e2c72798a42621745a805c5e..0000000000000000000000000000000000000000 Binary files a/data/AccomodationAndMealsForfaits_en.csv and /dev/null differ diff --git a/data/AccomodationAndMealsForfaits_en.numbers b/data/AccomodationAndMealsForfaits_en.numbers deleted file mode 100755 index dc1284a45b900a6b1671ed70d588c363b2b521fb..0000000000000000000000000000000000000000 Binary files a/data/AccomodationAndMealsForfaits_en.numbers and /dev/null differ diff --git a/data/AccomodationAndMealsForfaits_fr.csv b/data/AccomodationAndMealsForfaits_fr.csv deleted file mode 100644 index 8b71a64f48ab37711f5a9f8ec7d2867f5c0d9d35..0000000000000000000000000000000000000000 --- a/data/AccomodationAndMealsForfaits_fr.csv +++ /dev/null @@ -1,31 +0,0 @@ -Destination;Hebergement;Repas -France;125;27 -Allemagne;150;35 -Arabie Saoudite;200;40 -Autriche;110;40 -Belgique;150;35 -Canada;150;30 -Chine;113;37 -Egypte;150;25 -Emirats Arabes Unis;160;46 -Espagne;130;30 -Etats-Unis;140;47 -Grce;140;25 -Inde;160;47 -Irlande;180;30 -Italie;120;37 -Japon;150;25 -Maroc;110;25 -Mexique;130;27 -Norvge;160;40 -Pays-Bas;150;32 -Pologne;110;23 -Portugal;108;25 -Qatar;210;35 -Royaume-Uni;130;28 -Russie;180;50 -Singapour;170;42 -Sude;90;30 -Suisse;192;35 -Taiwan;123;37 -Turquie;150;28 \ No newline at end of file diff --git a/data/AccomodationAndMealsForfaits_fr.numbers b/data/AccomodationAndMealsForfaits_fr.numbers deleted file mode 100755 index 5269dd5685195ad2731beccaf11b68ec1612fd36..0000000000000000000000000000000000000000 Binary files a/data/AccomodationAndMealsForfaits_fr.numbers and /dev/null differ diff --git a/data/BaremeTauxEloignement.csv b/data/BaremeTauxEloignement.csv deleted file mode 100644 index e03e92d3f006acf8ebf53734650d76dbddf81b65..0000000000000000000000000000000000000000 --- a/data/BaremeTauxEloignement.csv +++ /dev/null @@ -1,84 +0,0 @@ -Tableau 1 -Barème Taux d’Éloignement; -Afrique du sud;10 % -Algérie;15 % -Allemagne;0 % -Arabie saoudite;12 % -Argentine;11 % -Australie;3 % -Autriche;0 % -Belgique;0 % -Bolivie;11 % -Brésil;11 % -Bulgarie;10 % -Cameroun;13 % -Canada;3 % -Chili;9 % -Chine;13 % -Chypre;4 % -Colombie;13 % -Corée;11 % -Croatie;7 % -Danemark;0 % -Djibouti;13 % -E.A.U;9 % -Egypte;16 % -Equateur;12 % -Espagne;0 % -Estonie;7 % -Etats unis;3 % -Ethiopie;12 % -Finlande;0 % -Grande Bretagne;0 % -Grèce;0 % -Guadeloupe;3 % -Guyane;7 % -Hong Kong;8 % -Hongrie;6 % -Ile Maurice;8 % -Inde;15 % -Indonésie;17 % -Irlande;0 % -Israël;9 % -Italie;0 % -Japon;8 % -Jordanie;10 % -Kenya;13 % -Koweït;11 % -Laos;13 % -Luxembourg;0 % -Madagascar;13 % -Malaisie;14 % -Maroc;8 % -Martinique;3 % -Mauritanie;10 % -Mexique;12 % -Mozambique;14 % -Nigeria;17 % -Norvège;0 % -Nouvelle Calédonie;4 % -Pakistan;17 % -Pérou;13 % -Philippines;16 % -Pologne;8 % -Polynésie;5 % -Portugal;0 % -Qatar;9 % -République Congo;14 % -République tchèque;6 % -Roumanie;11 % -Russie;13 % -Sénégal;10 % -Serbie;11 % -Singapour;6 % -Slovaquie;6 % -Sri Lanka;15 % -Suède;0 % -Suisse;0 % -Taiwan;11 % -Thaïlande;12 % -Tunisie;7 % -Turquie;10 % -Ukraine;12 % -Venezuela;13 % -Vietnam;13 % \ No newline at end of file diff --git a/data/ForfaitsRemboursements.csv b/data/ForfaitsRemboursements.csv deleted file mode 100644 index 921485d633c352ab252f70c9ea8954c384e1fca3..0000000000000000000000000000000000000000 --- a/data/ForfaitsRemboursements.csv +++ /dev/null @@ -1,31 +0,0 @@ -Destination;;Hébergement;;Repas -France;;IDF 125€ / Province 100€;;27 € -Allemagne;;150 € ;;35 € -Arabie Saoudite;;200 € ;;40 € -Autriche;;110 € ;;40 € -Belgique;;150 € ;;35 € -Canada;;150 € ;;30 € -Chine;;113 € ;;37 € -Egypte;;150 € ;;25 € -Emirats Arabes Unis;;160 € ;;46 € -Espagne;;130 € ;;30 € -Etats-Unis;;140 € ;;47 € -Grèce;;140 € ;;25 € -Inde;;160 € ;;47 € -Irlande;;180 € ;;30 € -Italie;;120 € ;;37 € -Japon;;150 € ;;25 € -Maroc;;110 € ;;25 € -Mexique;;130 € ;;27 € -Norvège;;160 € ;;40 € -Pays-Bas;;150 € ;;32 € -Pologne;;110 € ;;23 € -Portugal;;108 € ;;25 € -Qatar;;210 € ;;35 € -Royaume-Uni;;130 € ;;28 € -Russie;;180 € ;;50 € -Singapour;;170 € ;;42 € -Suède;;90 € ;;30 € -Suisse;;192 € ;;35 € -Taiwan;;123 € ;;37 € -Turquie;;150 € ;;28 € \ No newline at end of file diff --git a/data/NonPrisEnCharge.csv b/data/NonPrisEnCharge.csv deleted file mode 100644 index a1f8f4f34a2c7ac4dc7a71e3c069c02d05208fd6..0000000000000000000000000000000000000000 --- a/data/NonPrisEnCharge.csv +++ /dev/null @@ -1,14 +0,0 @@ -Non pris en charge via Note de Frais -"Matériel informatique : téléphone, chargeur, tablette, adaptateur prise, etc." -"Outillage : balai, tournevis, disque de disqueuse, etc." -"Mobilier/Aménagement de bureau : plantes, dalles, poufs, etc." -"Fournitures de bureau : café, piles, etc." -Conférence/Cotisation -Séminaire/Réunion Team Building -Doublon de clés -Equipement de Protection Individuelle (EPI) -Achat de bagagerie : neuf/perdu/endommagé -Lavage et recharge Carte de Lavage : tous types de véhicule -Consommation alcoolisée -"Collation : confiserie, gâteau, boisson, etc." -"Prestation de loisirs : Spa, piscine, massage, remontée mécanique, escape game, etc." \ No newline at end of file diff --git a/data/accommodation_meal_rates.csv b/data/accommodation_meal_rates.csv new file mode 100644 index 0000000000000000000000000000000000000000..df25fd62dc24e0fa3d64b5748d84782d5ec0da30 --- /dev/null +++ b/data/accommodation_meal_rates.csv @@ -0,0 +1,31 @@ +country,pays,accommodation,meal +France,France,Ile de France 125€ / Province 100€,27€ +Germany,Allemagne,150€,35€ +Saudi Arabia,Arabie Saoudite,200€,40€ +Austria,Autriche,110€,40€ +Belgium,Belgique,150€,35€ +Canada,Canada,150€,30€ +China,Chine,113€,37€ +Egypt,Egypte,150€,25€ +United Arab Emirates,Emirats Arabes Unis,160€,46€ +Spain,Espagne,130€,30€ +United States,Etats-Unis,140€,47€ +Greece,Grèce,140€,25€ +India,Inde,160€,47€ +Ireland,Irlande,180€,30€ +Italy,Italie,120€,37€ +Japan,Japon,150€,25€ +Morocco,Maroc,110€,25€ +Mexico,Mexique,130€,27€ +Norway,Norvège,160€,40€ +The Netherlands,Pays-Bas,150€,32€ +Poland,Pologne,110€,23€ +Portugal,Portugal,108€,25€ +Qatar,Qatar,210€,35€ +United Kingdom,Royaume-Uni,130€,28€ +Russia,Russie,180€,50€ +Singapore,Singapour,170€,42€ +Sweden,Suède,90€,30€ +Swiss,Suisse,192€,35€ +taiwan,Taïwan,123€,37€ +Türkiye,Turquie,150€,28€ \ No newline at end of file diff --git a/data/business_trips_content_en.docx b/data/business_trips_content_en.docx deleted file mode 100644 index c4e026068990cec5ffbff745b26d46d8fcb8c8bf..0000000000000000000000000000000000000000 Binary files a/data/business_trips_content_en.docx and /dev/null differ diff --git a/data/business_trips_content_fr.docx b/data/business_trips_content_fr.docx deleted file mode 100644 index 68da3f932c463366391858706e7bffe5c5de4b64..0000000000000000000000000000000000000000 Binary files a/data/business_trips_content_fr.docx and /dev/null differ diff --git a/data/business_trips_content_until_3_en.docx b/data/business_trips_content_until_3_en.docx deleted file mode 100644 index ff314c0a40d3875042473fd7a3405533f32a9dff..0000000000000000000000000000000000000000 Binary files a/data/business_trips_content_until_3_en.docx and /dev/null differ diff --git a/data/business_trips_content_until_3_enfr.docx b/data/business_trips_content_until_3_enfr.docx deleted file mode 100644 index 849b7f3f3655e252f1d712f2d7a68e215e36aa5a..0000000000000000000000000000000000000000 Binary files a/data/business_trips_content_until_3_enfr.docx and /dev/null differ diff --git a/data/business_trips_content_until_3_fr.docx b/data/business_trips_content_until_3_fr.docx deleted file mode 100644 index 4c4c5e99f7f59973133a7bcec232492defa8382a..0000000000000000000000000000000000000000 Binary files a/data/business_trips_content_until_3_fr.docx and /dev/null differ diff --git a/data/business_trips_content_until_9_en.docx b/data/business_trips_content_until_9_en.docx deleted file mode 100644 index 1d09f518827a373d64888667d504ae41c1940198..0000000000000000000000000000000000000000 Binary files a/data/business_trips_content_until_9_en.docx and /dev/null differ diff --git a/data/business_trips_content_until_end_en.docx b/data/business_trips_content_until_end_en.docx new file mode 100644 index 0000000000000000000000000000000000000000..b04c0ff4bf5394b6103272296ddccae9f96ba2ff Binary files /dev/null and b/data/business_trips_content_until_end_en.docx differ diff --git a/data/business_trips_content_until_end_fr.docx b/data/business_trips_content_until_end_fr.docx new file mode 100644 index 0000000000000000000000000000000000000000..8db087b9b5c017baeabc7fbb98f7fce6703970c0 Binary files /dev/null and b/data/business_trips_content_until_end_fr.docx differ diff --git a/data/business_trips_plan_en.docx b/data/business_trips_plan_en.docx deleted file mode 100644 index 2fb08997e8264ca0975e2a4ff8918c6cef6e908c..0000000000000000000000000000000000000000 Binary files a/data/business_trips_plan_en.docx and /dev/null differ diff --git a/data/business_trips_plan_until_3_en.docx b/data/business_trips_plan_until_3_en.docx deleted file mode 100644 index 478c5570933cd939077ddfec2466f7c221a13149..0000000000000000000000000000000000000000 Binary files a/data/business_trips_plan_until_3_en.docx and /dev/null differ diff --git a/data/business_trips_plan_until_3_fr.docx b/data/business_trips_plan_until_3_fr.docx deleted file mode 100644 index 7a7c12cfee53a64d2051cfb77733d1ab2a4f2f16..0000000000000000000000000000000000000000 Binary files a/data/business_trips_plan_until_3_fr.docx and /dev/null differ diff --git a/data/business_trips_plan_until_9_en.docx b/data/business_trips_plan_until_end_en.docx similarity index 58% rename from data/business_trips_plan_until_9_en.docx rename to data/business_trips_plan_until_end_en.docx index 186f4577e98856cfc8e32595ad212f3b4e3fe99a..47101ab00d6b20506d0af31f6b47dbd72ae0f9e8 100644 Binary files a/data/business_trips_plan_until_9_en.docx and b/data/business_trips_plan_until_end_en.docx differ diff --git a/data/remote_rates.csv b/data/remote_rates.csv new file mode 100644 index 0000000000000000000000000000000000000000..87d7d6d1626be2424af1149031c5b3c11a394c6d --- /dev/null +++ b/data/remote_rates.csv @@ -0,0 +1,83 @@ +pays,country,rate +Afrique du sud,South Africa,10% +Algérie,Algeria,15% +Allemagne,Germany,0% +Arabie saoudite,Saudi Arabia,12% +Argentine,Argentina,11% +Australie,Australia,3% +Autriche,Austria,0% +Belgique,Belgium,0% +Bolivie,Bolivia,11% +Brésil,Brazil,11% +Bulgarie,Bulgaria,10% +Cameroun,Cameroon,13% +Canada,Canada,3% +Chili,Chile,9% +Chine,China,13% +Chypre,Cyprus,4% +Colombie,Colombia,13% +Corée,Korea,11% +Croatie,Croatia,7% +Danemark,Denmark,0% +Djibouti,Djibouti,13% +E.A.U,united arab emirates,9% +Egypte,Egypt,16% +Equateur,Ecuador,12% +Espagne,Spain,0% +Estonie,Estonia,7% +Etats-Unis,United States,3% +Ethiopie,Ethiopia,12% +Finlande,Finland,0% +Grande Bretagne,Britain,0% +Grèce,Greece,0% +Guadeloupe,Guadeloupe,3% +Guyane,Guyana,7% +Hong Kong,hong kong,8% +Hongrie,Hungary,6% +Ile Maurice,Mauritius Islands,8% +Inde,India,15% +Indonésie,Indonesia,17% +Irlande,Ireland,0% +Israël,Israel,9% +Italie,Italy,0% +Japon,Japan,8% +Jordanie,Jordan,10% +Kenya,Kenya,13% +Koweït,Kuwait,11% +Laos,Laos,13% +Luxembourg,Luxemburg,0% +Madagascar,Madagascar,13% +Malaisie,Malaysia,14% +Maroc,Morocco,8% +Martinique,Martinique,3% +Mauritanie,Mauritania,10% +Mexique,Mexico,12% +Mozambique,mozambique,14% +Nigeria,Nigeria,17% +Norvège,Norway,0% +Nouvelle Calédonie,New Caledonia,4% +Pakistan,Pakistan,17% +Pérou,Peru,13% +Philippines,Philippines,16% +Pologne,Poland,8% +Polynésie,Polynesia,5% +Portugal,Portugal,0% +Qatar,Qatar,9% +République Congo,Republic of the Congo,14% +République tchèque,Czech Republic,6% +Roumanie,Romania,11% +Russie,Russia,13% +Sénégal,Senegal,10% +Serbie,Serbia,11% +Singapour,Singapore,6% +Slovaquie,Slovakia,6% +Sri Lanka,Sri Lanka,15% +Suède,Sweden,0% +Suisse,Swiss,0% +Taiwan,taiwan,11% +Thaïlande,Thailand,12% +Tunisie,Tunisia,7% +Turquie,Türkiye,10% +Ukraine,Ukraine,12% +Venezuela,Venezuela,13% +Vietnam,Vietnam,13% \ No newline at end of file diff --git a/data/transports.docx b/data/transports.docx deleted file mode 100644 index 4c217bcc1f088225f296d0eb14eb11dad4700a44..0000000000000000000000000000000000000000 Binary files a/data/transports.docx and /dev/null differ diff --git a/data/transports_content_en.docx b/data/transports_content_en.docx deleted file mode 100644 index c99dfe94b47ff9fede7362e30550427126afdf2d..0000000000000000000000000000000000000000 Binary files a/data/transports_content_en.docx and /dev/null differ diff --git a/data/transports_content_fr.docx b/data/transports_content_fr.docx deleted file mode 100644 index 19d685875054e5a425c7a5aacc3d402d9cfad683..0000000000000000000000000000000000000000 Binary files a/data/transports_content_fr.docx and /dev/null differ diff --git a/data/transports_plan.docx b/data/transports_plan.docx deleted file mode 100644 index 8172c4ff2a5f3ecc8196c1e408afaeaa78ca55b2..0000000000000000000000000000000000000000 Binary files a/data/transports_plan.docx and /dev/null differ diff --git a/data/transports_plan_en.docx b/data/transports_plan_en.docx deleted file mode 100644 index 5905b200d041f9b0867624d89ab533c659b939ed..0000000000000000000000000000000000000000 Binary files a/data/transports_plan_en.docx and /dev/null differ diff --git a/data/transports_plan_short_en.docx b/data/transports_plan_short_en.docx deleted file mode 100644 index 5ba6548062de9c9de67c5096ca2411f59ba42214..0000000000000000000000000000000000000000 Binary files a/data/transports_plan_short_en.docx and /dev/null differ diff --git a/data/transports_plan_short_fr.docx b/data/transports_plan_short_fr.docx deleted file mode 100644 index c8b6097ff8e4c096e9e0f67e311f81c882da5cd3..0000000000000000000000000000000000000000 Binary files a/data/transports_plan_short_fr.docx and /dev/null differ diff --git a/data/~$ansports_contenu.txt b/data/~$ansports_contenu.txt deleted file mode 100644 index eda54b1681d013d66d5b5ea588fe54f213436b9a..0000000000000000000000000000000000000000 Binary files a/data/~$ansports_contenu.txt and /dev/null differ diff --git a/data/~$placementsEtVoyages.docx b/data/~$placementsEtVoyages.docx deleted file mode 100644 index 1215360c558324f168a348f977b46d5a160ee437..0000000000000000000000000000000000000000 Binary files a/data/~$placementsEtVoyages.docx and /dev/null differ diff --git a/data/~$siness_trip_plan_until_3_fr.docx b/data/~$siness_trip_plan_until_3_fr.docx deleted file mode 100644 index 3a571977a55490f9911525c3c8debdadd32e9b95..0000000000000000000000000000000000000000 Binary files a/data/~$siness_trip_plan_until_3_fr.docx and /dev/null differ diff --git a/data/~$siness_trips_content_until_3_fr.docx b/data/~$siness_trips_content_until_3_fr.docx deleted file mode 100644 index 3a571977a55490f9911525c3c8debdadd32e9b95..0000000000000000000000000000000000000000 Binary files a/data/~$siness_trips_content_until_3_fr.docx and /dev/null differ diff --git a/data/~$siness_trips_content_until_9_en.docx b/data/~$siness_trips_content_until_9_en.docx deleted file mode 100644 index 40c3d5c3d1afe4ef002c13f20160b0e1d8fc96f3..0000000000000000000000000000000000000000 Binary files a/data/~$siness_trips_content_until_9_en.docx and /dev/null differ diff --git a/data/~$ansports.docx b/data/~$siness_trips_content_until_end_en.docx similarity index 100% rename from data/~$ansports.docx rename to data/~$siness_trips_content_until_end_en.docx diff --git a/data/~$siness_trips_plan_until_9_en.docx b/data/~$siness_trips_plan_until_9_en.docx deleted file mode 100644 index 628830311cd684d862b7bebd29f7866e94ca1af9..0000000000000000000000000000000000000000 Binary files a/data/~$siness_trips_plan_until_9_en.docx and /dev/null differ diff --git a/requirements.txt b/requirements.txt index f3e5bafa3d9b245e1751b8f6d9340e9e0b20ca60..7fc1652dcace4cb66afd982b39c45adbb3fab4d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,102 +1,4 @@ -aiofiles==23.1.0 -aiohttp==3.8.4 -aiosignal==1.3.1 -altair==5.0.1 -anyio==3.7.0 -async-timeout==4.0.2 -attrs==23.1.0 -backoff==2.2.1 -certifi==2023.5.7 -charset-normalizer==3.1.0 chromadb==0.3.25 -click==8.1.3 -clickhouse-connect==0.5.25 -coloredlogs==15.0.1 -contourpy==1.0.7 -cycler==0.11.0 -dataclasses-json==0.5.7 -duckdb==0.8.0 -exceptiongroup==1.1.1 -fastapi==0.96.0 -ffmpy==0.3.0 -filelock==3.12.0 -flatbuffers==23.5.26 -fonttools==4.39.4 -frozenlist==1.3.3 -fsspec==2023.5.0 gradio==3.33.1 -gradio_client==0.2.5 -h11==0.14.0 -hnswlib==0.7.0 -httpcore==0.17.2 -httptools==0.5.0 -httpx==0.24.1 -huggingface-hub==0.15.1 -humanfriendly==10.0 -idna==3.4 -Jinja2==3.1.2 -jsonschema==4.17.3 -kiwisolver==1.4.4 langchain==0.0.190 -linkify-it-py==2.0.2 -lxml==4.9.2 -lz4==4.3.2 -markdown-it-py==2.2.0 -MarkupSafe==2.1.3 -marshmallow==3.19.0 -marshmallow-enum==1.5.1 -matplotlib==3.7.1 -mdit-py-plugins==0.3.3 -mdurl==0.1.2 -monotonic==1.6 -mpmath==1.3.0 -multidict==6.0.4 -mypy-extensions==1.0.0 -numexpr==2.8.4 -numpy==1.24.3 -onnxruntime==1.15.0 -openai==0.27.7 -openapi-schema-pydantic==1.2.4 -orjson==3.9.0 -overrides==7.3.1 -packaging==23.1 pandas==2.0.2 -Pillow==9.5.0 -posthog==3.0.1 -protobuf==4.23.2 -pydantic==1.10.8 -pydub==0.25.1 -Pygments==2.15.1 -pyparsing==3.0.9 -pyrsistent==0.19.3 -python-dateutil==2.8.2 -python-docx==0.8.11 -python-dotenv==1.0.0 -python-multipart==0.0.6 -pytz==2023.3 -PyYAML==6.0 -regex==2023.6.3 -requests==2.31.0 -semantic-version==2.10.0 -six==1.16.0 -sniffio==1.3.0 -SQLAlchemy==2.0.15 -starlette==0.27.0 -sympy==1.12 -tabulate==0.9.0 -tenacity==8.2.2 -tiktoken==0.4.0 -tokenizers==0.13.3 -toolz==0.12.0 -tqdm==4.65.0 -typing-inspect==0.9.0 -typing_extensions==4.6.3 -tzdata==2023.3 -uc-micro-py==1.0.2 -urllib3==2.0.2 -uvicorn==0.22.0 -uvloop==0.17.0 -watchfiles==0.19.0 -websockets==11.0.3 -yarl==1.9.2 -zstandard==0.21.0 diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..354bb53fbc41e36287e84242637663afbd86f280 Binary files /dev/null and b/src/.DS_Store differ diff --git a/src/control/__pycache__/control.cpython-310.pyc b/src/control/__pycache__/control.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67c1af91a1c7f3955ab0beacce33e41fba0ba453 Binary files /dev/null and b/src/control/__pycache__/control.cpython-310.pyc differ diff --git a/src/control/control.py b/src/control/control.py index 3332f4e42fc9e94cf8e12c914c28b644df2146b6..3f3b52e65d2f2a743091c0a5d1d82e0cf4bc037e 100644 --- a/src/control/control.py +++ b/src/control/control.py @@ -1,43 +1,115 @@ -import chromadb +import pandas as pd -import src.tools.retriever as rtrvr -import src.tools.llm as llm -from src.domain.doc import Doc +from src.tools.retriever import Retriever +from src.tools.llm import LlmAgent +from src.model.block import Block -chroma_client = chromadb.Client() -plan_language = 'en' -content_language = 'en' -path_plan = 'data/business_trips_plan_until_9_en.docx' -path_content = 'data/business_trips_content_until_9_en.docx' -collection_name = "until_9" +class Controller: -doc_plan = Doc(path_plan) -doc_content = Doc(path_content) -collection_ = rtrvr.init_collections(chroma_client, doc_plan, doc_content, collection_name) + def __init__(self, retriever: Retriever, llm: LlmAgent, plan_language: str, content_language: str, specials: {}): + self.plan_language = plan_language + self.content_language = content_language + self.retriever = retriever + self.specials = specials + self.llm = llm + def get_response(self, query_fr: str) -> (str, [Block]): + query = self.llm.translate(text=query_fr) if self.plan_language == 'en' else query_fr + block_sources = self.retriever.similarity_search(query=query) + block_sources = self._select_best_sources(block_sources) + for block in block_sources: + self._expand_block_with_specials(block, query_fr) + sources_contents = [s.content for s in block_sources] + context = '\n'.join(sources_contents) + answer = self.llm.generate_paragraph(query=query, context=context, language=self.content_language) + sources_contents_fr = [s.content_fr for s in block_sources[:2]] + context_fr = '\n'.join(sources_contents_fr) + if self.content_language == 'en': + answer = self.llm.generate_answer(answer_en=answer, query=query_fr, context_fr=context_fr) + answer = answer.strip().strip("'''").strip("```") + return answer, block_sources -def get_response(query): - if plan_language == 'en': - query = llm.translate(query) - sources = rtrvr.similarity_search(collection=collection_, query=query) - sources = select_best_sources(sources) - sources_contents = [s['content'] for s in sources] - context = '\n'.join(sources_contents) - answer = llm.generate_paragraph(query=query, context=context, language=content_language) - if content_language == 'en': - answer = llm.translate(text=answer, language='fr') - return answer.lstrip(), sources + @staticmethod + def _select_best_sources(sources: [Block], delta_1_2=0.1, delta_1_n=0.25, absolute=1.1, alpha=0.85) -> [Block]: + """ + Select the best sources: not far from the very best, not far from the last selected, and not too bad per se + """ + best_sources = [] + for idx, s in enumerate(sources): + if idx == 0 \ + or (s.distance - sources[idx - 1].distance < delta_1_2 + and s.distance - sources[0].distance < delta_1_n) \ + or s.distance < absolute: + best_sources.append(s) + delta_1_2 *= alpha + delta_1_n *= alpha + absolute *= alpha + else: + break + return best_sources + def _expand_block_with_specials(self, block: Block, query: str) -> Block: + """ + Performs special treatments for blocks expanding the text in the block + For example, it may add specific content extracted from a table based on elements of the query + """ -def select_best_sources(sources: [], delta_1_2=0.1, delta_1_n=0.25, absolute=1.1) -> []: - best_sources = [] - for idx, s in enumerate(sources): - if idx == 0 \ - or (s['distance_f'] - sources[idx - 1]['distance_f'] < delta_1_2 - and s['distance_f'] - sources[0]['distance_f'] < delta_1_n) \ - or s['distance_f'] < absolute: - best_sources.append(s) - return best_sources + def any_in(l1: [], l2: []) -> bool: + """ + checks if any of el in l1 belongs to l2 + """ + return 0 < len([el for el in l1 if el in l2]) + def get_countries_names(df: pd.DataFrame) -> [str]: + """ + extends the ortograph of countries: ex. Etats-Unis = USA = Etats Unis, etc. + """ + countries_fr = list(df['pays']) + countries_en = list(df['country']) + countries_names = {c_fr: [c_fr, c_en] for c_fr, c_en in zip(countries_fr, countries_en)} + countries_extensions = self.specials['countries_extensions'] + for c in set(countries_extensions.keys()).intersection(set(countries_names.keys())): + countries_names[c] += countries_extensions[c] + return countries_names + def remote_rate_fn(ctrl: Controller, block: Block, query: str) -> Block: + remote_rate_df = self.specials['remote_rate_df'] + remote_rate_known = self.specials['remote_rate_known'] + remote_rate_unknown = self.specials['remote_rate_unknown'] + countries_fr = list(remote_rate_df['pays']) + countries_names = get_countries_names(remote_rate_df) + countries_of_interest = [c for c in countries_fr if any_in(countries_names[c], query)] + for c in countries_of_interest: + rate = remote_rate_df[remote_rate_df['pays'] == c]['rate'].values[0] + block.content += remote_rate_known + c + " is " + rate + '\n' + if len(countries_of_interest) == 0: + block.content += remote_rate_unknown + return block + + def accommodation_meal_fn(ctrl: Controller, block: Block, query: str) -> Block: + accommodation_meal_df = self.specials['accommodation_meal_df'] + accommodation_meal_known = self.specials['accommodation_meal_known'] + accommodation_meal_unknown = self.specials['accommodation_meal_unknown'] + countries_fr = list(accommodation_meal_df['pays']) + countries_names = get_countries_names(df=accommodation_meal_df) + countries_of_interest = [c for c in countries_fr if any_in(countries_names[c], query)] + for c in countries_of_interest: + rate = accommodation_meal_df[accommodation_meal_df['pays'] == c][['meal', 'accommodation']].values + block.content += accommodation_meal_known + c + " is " + rate[0][0] + ' for meals and ' \ + + rate[0][1] + ' for accommodation\n' + if len(countries_of_interest) == 0: + block.content += accommodation_meal_unknown + return block + + def expand_block(special: str, ctrl: Controller, block: Block, query: str) -> Block: + routing_table = {'RemotenessRateTable': remote_rate_fn, + 'AccommodationMealTable': accommodation_meal_fn, } + if special in routing_table.keys(): + fn = routing_table[special] + block = fn(ctrl, block, query) + return block + + for special in block.specials: + block = expand_block(special, self, block, query) + return block diff --git a/src/domain/__pycache__/container.cpython-310.pyc b/src/domain/__pycache__/container.cpython-310.pyc deleted file mode 100644 index 1a86fb7609c303509fddda7a8e9d5c7cd6989244..0000000000000000000000000000000000000000 Binary files a/src/domain/__pycache__/container.cpython-310.pyc and /dev/null differ diff --git a/src/domain/__pycache__/doc.cpython-310.pyc b/src/domain/__pycache__/doc.cpython-310.pyc deleted file mode 100644 index df468f912a760375e636ee2c47dad438a92f58ac..0000000000000000000000000000000000000000 Binary files a/src/domain/__pycache__/doc.cpython-310.pyc and /dev/null differ diff --git a/src/domain/__pycache__/paragraph.cpython-310.pyc b/src/domain/__pycache__/paragraph.cpython-310.pyc deleted file mode 100644 index 271387aeb3c6d23f54200de8bf2e25c50404b911..0000000000000000000000000000000000000000 Binary files a/src/domain/__pycache__/paragraph.cpython-310.pyc and /dev/null differ diff --git a/src/domain/__pycache__/style.cpython-310.pyc b/src/domain/__pycache__/style.cpython-310.pyc deleted file mode 100644 index 915e0a5c6846062b59b9e9c705c6894fd7317f76..0000000000000000000000000000000000000000 Binary files a/src/domain/__pycache__/style.cpython-310.pyc and /dev/null differ diff --git a/src/domain/doc.py b/src/domain/doc.py deleted file mode 100644 index 89f2aafdcd1e44b5d3c52f879e0ea5b043a19160..0000000000000000000000000000000000000000 --- a/src/domain/doc.py +++ /dev/null @@ -1,71 +0,0 @@ -import docx - -from src.domain.container import Container -from src.domain.paragraph import Paragraph -from src.domain.style import Style - - -class Doc: - - def __init__(self, path='', id_=None): - - self.xdoc = docx.Document(path) - self.title = path.split('/')[-1] - self.id_ = id(self) - self.path = path - paragraphs = [Paragraph(xp, self.id_, i) for (i, xp) in enumerate(self.xdoc.paragraphs)] - self.container = Container(paragraphs, father=self) - self.styles = [Style(xs, self.id_, i) for (i, xs) in enumerate(self.xdoc.styles)] - - def save_as_docx(self, path): - self.xdoc.save(path) - - def apply_styles_from(self, ref_doc): - - ref_doc_styles_names = [s.xstyle.name for s in ref_doc.styles] - common_styles = [s for s in self.styles if s.xstyle.name in ref_doc_styles_names] - - for s in common_styles: - s.copy_from(ref_doc.xdoc.styles[s.xstyle.name]) - - @property - def structure(self): - - return self.container.structure - - @property - def blocks(self): - - def from_list_to_str(index_list): - index_str = str(index_list[0]) - for el in index_list[1:]: - index_str += '.' + str(el) - return index_str - - current_index = [] - blocks = [] - for block in self.container.blocks: - block['doc'] = self.title - current_level = len(current_index) - if 0 < block['level']: - if block['level'] == current_level: - current_index[-1] += 1 - elif current_level < block['level']: - current_index.append(1) - elif block['level'] < current_level: - current_index = current_index[:block['level']] - current_index[-1] += 1 - block['paragraph'] = from_list_to_str(current_index) - else: - block['paragraph'] = "0" - blocks.append(block) - return blocks - - - - - - - - - diff --git a/src/domain/project.py b/src/domain/project.py deleted file mode 100644 index 8425706b3d05184efb44e6f5ed4a8fc438387aee..0000000000000000000000000000000000000000 --- a/src/domain/project.py +++ /dev/null @@ -1,9 +0,0 @@ -from src.domain.doc import Doc - - -class Project: - - def __init__(self, name: str, docs: [Doc]): - - self.docs = docs - self.name = name diff --git a/src/domain/style.py b/src/domain/style.py deleted file mode 100644 index 41c197f070a41c0525232eb433cadfcd4e60f547..0000000000000000000000000000000000000000 --- a/src/domain/style.py +++ /dev/null @@ -1,121 +0,0 @@ -from docx.enum.style import WD_STYLE_TYPE -class Style: - - def __init__(self, xstyle, doc_id, id_): - - self.id_ = int(str(doc_id)+str(id_)) - self.xstyle = xstyle - #self.new_style = self.copy_from - - def copy_from(self, xref): # need to be further developed - - if xref.type == WD_STYLE_TYPE.PARAGRAPH: - self.xstyle.font.size = xref.font.size - self.xstyle.font.color.rgb = xref.font.color.rgb - self.xstyle.font.name = xref.font.name - self.xstyle.font.all_caps = xref.font.all_caps - # Read/write. Causes text in this font to appear in capital letters. - self.xstyle.font.bold = xref.font.bold - # Read/write. Causes text in this font to appear in bold. - self.xstyle.font.complex_script= xref.font.complex_script - # Read/write tri-state value. When True, causes the characters in - # the run to be treated as complex script regardless of their Unicode values. - # "complex script" refers to text written using a complex writing system such as Arabic, Hebrew, Tamil, - # Persian, and others.These scripts require special typesetting and handling because they have different - # writing directions, glyph connections, and letter shape variations. Word provides features that support - # these complex scripts, allowing users to easily create, edit, and format this type of text. - self.xstyle.font.cs_bold = xref.font.cs_bold - # Read/write tri-state value. When True, causes the complex script characters - # in the run to be displayed in bold typeface. - self.xstyle.font.cs_italic = xref.font.cs_italic - # Read/write tri-state value. When True, causes the complex script characters - # in the run to be displayed in italic typeface - self.xstyle.font.double_strike = xref.font.double_strike - # Read/write tri-state value. When True, causes the text in the run to appear with double strikethrough. - self.xstyle.font.emboss = xref.font.emboss - # Read/write tri-state value. When True, causes the text in the run to appear - # as if raised off the page in relief. - self.xstyle.font.hidden = xref.font.hidden - # Read/write tri-state value. When True, causes the text in the run to be hidden from display, - # unless applications settings force hidden text to be shown. - self.xstyle.font.highlight_color = xref.font.highlight_color - # A member of WD_COLOR_INDEX indicating the color of highlighting applied, - # or None if no highlighting is applied. - self.xstyle.font.imprint = xref.font.imprint - # Read/write tri-state value. When True, - # causes the text in the run to appear as if pressed into the page. - self.xstyle.font.italic = xref.font.italic - self.xstyle.font.math = xref.font.math - self.xstyle.font.no_proof = xref.font.no_proof - # Read/write tri-state value. When True, specifies that the contents of this run - # should not report any errors when the document is scanned for spelling and grammar. - self.xstyle.font.outline = xref.font.outline - # Read/write tri-state value. When True causes the characters in the run to appear as if they - # have an outline, by drawing a one pixel wide border around the inside and - # outside borders of each character glyph. - self.xstyle.font.rtl = xref.font.rtl - # Read/write tri-state value. When True causes the text in the - # run to have right-to-left characteristics. - self.xstyle.font.shadow = xref.font.shadow - self.xstyle.font.small_caps = xref.font.small_caps - self.xstyle.font.snap_to_grid = xref.font.snap_to_grid - # Read/write tri-state value. When True causes the run to use the document grid characters per line - # settings defined in the docGrid element when laying out the characters in this run. - # Snap to grid" is a layout feature that helps users align text boxes, images, or other objects precisely - # to a virtual gridline, ensuring consistent spacing and alignment of objects in a document. It improves the - # visual appearance of a document and makes it easier to read and understand. This feature is particularly - # useful for creating large documents such as reports, posters, and flyers, making them look more - # professional, organized, and readable.""" - self.xstyle.font.spec_vanish = xref.font.spec_vanish - # Read/write tri-state value. When True, specifies that the given run shall always behave as if it is - # hidden, even when hidden text is being displayed in the current document. The property has a very narrow, - # specialized use related to the table of contents. - self.xstyle.font.strike = xref.font.strike - # Read/write tri-state value. When True causes the text in the run to appear with a single horizontal line - # through the center of the line. - self.xstyle.font.subscript = xref.font.subscript - # Boolean indicating whether the characters in this Font appear as subscript. None indicates the - # subscript/subscript value is inherited from the style hierarchy. - self.xstyle.font.superscript = xref.font.superscript - self.xstyle.font.underline = xref.font.underline - self.xstyle.font.web_hidden = xref.font.web_hidden - # Using the "Web hidden" property allows us to create multiple versions of a document where some content - # can be hidden, while other content can be displayed publicly. For example, in a resume, you can use the - # "Web hidden" property to hide private information such as phone numbers and addresses. This information - # will only be displayed when an employer chooses to view it. - - self.xstyle.base_style = xref.base_style - # Style object this style inherits from or None if this style is not based on another style. - # self.xstyle.builtin = xref.builtin - self.xstyle.hidden = xref.hidden - # True if display of this style in the style gallery and list of recommended styles is suppressed. - # False otherwise. In order to be shown in the style gallery, this value must be False and quick_style - # must be True. - self.xstyle.locked = xref.locked - # True if this style is locked. not appear in the styles panel or the style gallery and cannot be applied - # to document content - self.xstyle.name = xref.name - self.xstyle.priority = xref.priority - # The integer sort key governing display sequence of this style in the Word UI. None indicates no setting - # is defined, causing Word to use the default value of 0. Style name is used as a secondary sort key to - # resolve ordering of styles having the same priority value. - # In Microsoft Word, "priority" is typically used to describe the importance of markers and comments to - # help authors and editors determine the urgency and priority of the feedback and changes being provided. - # For example, a document may use priority markers such as "high," "medium," "low," etc. - # to indicate issues that need to be addressed with a higher priority. - - self.xstyle.quick_style = xref.quick_style - # True if this style should be displayed in the style gallery when hidden is False. Read/write Boolean. - # for example, Quick Styles can be found in the "Styles" group on the "Home" tab. - # self.xstyle.type = xref.type - self.xstyle.unhide_when_used = xref.unhide_when_used - # True if an application should make this style visible the next time it is applied to content. - # False otherwise. Note that python-docx does not automatically unhide a style having True for this - # attribute when it is applied to content. - - # "unhide_when_used" can refer to a feature in Microsoft Excel. It is a cell format option that allows the - # cell to automatically show when it is being used and hide when it is not being used. This is useful when - # dealing with complex worksheets as it helps users manage and organize data better. When the user needs to - # edit or input data, the cell will automatically show, and once the user has completed the operation, the - # cell will automatically hide to better present the data. - diff --git a/src/domain/user.py b/src/domain/user.py deleted file mode 100644 index c9ca1d9aa6448c0c3e8d57bff0fdb4fe828a6d52..0000000000000000000000000000000000000000 --- a/src/domain/user.py +++ /dev/null @@ -1,4 +0,0 @@ -class User: - - def __init__(self, username, ): - self.name = username diff --git a/src/model/__pycache__/block.cpython-310.pyc b/src/model/__pycache__/block.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7b1ef75a45383ceeec977398613904889c839f6 Binary files /dev/null and b/src/model/__pycache__/block.cpython-310.pyc differ diff --git a/src/model/__pycache__/container.cpython-310.pyc b/src/model/__pycache__/container.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16b2da669083c845cee8015d99b230ffff12f173 Binary files /dev/null and b/src/model/__pycache__/container.cpython-310.pyc differ diff --git a/src/model/__pycache__/doc.cpython-310.pyc b/src/model/__pycache__/doc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6bbbddfbc26dfa0af7a3ac9a3f1bd05be4553b1 Binary files /dev/null and b/src/model/__pycache__/doc.cpython-310.pyc differ diff --git a/src/model/__pycache__/paragraph.cpython-310.pyc b/src/model/__pycache__/paragraph.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..894156804e2967173cab232a25205d9e9dd074b8 Binary files /dev/null and b/src/model/__pycache__/paragraph.cpython-310.pyc differ diff --git a/src/model/block.py b/src/model/block.py new file mode 100644 index 0000000000000000000000000000000000000000..a71eb8ce0a900339fb986cada636a06a379acf17 --- /dev/null +++ b/src/model/block.py @@ -0,0 +1,41 @@ +class Block: + def __init__(self, doc: str = '', title: str = '', content: str = '', content_fr: str = '', + index: str = '', rank: int = 0, level: int = 0, distance: float = 99999): + self.doc = doc + self.title = title + self.content = content + self.content_fr = content_fr + self.specials = [] + self.index = index + self.rank = rank + self.level = level + self.distance = distance + + def to_dict(self) -> {}: + block_dict = {'doc': self.doc, 'title': self.title, 'content': self.content, 'content_fr': self.content_fr, + 'index': self.index, 'rank': self.rank, 'level': self.level, 'distance': self.distance} + for i, s in enumerate(self.specials): + special_key = 'special_'+str(i) + block_dict[special_key] = s + block_dict['specials_len'] = len(self.specials) + return block_dict + + def from_dict(self, block_dict: {}): + self.doc = block_dict['doc'] + self.title = block_dict['title'] + self.content = block_dict['content'] + self.content_fr = block_dict['content_fr'] + self.index = block_dict['index'] + self.rank = block_dict['rank'] + self.level = block_dict['level'] + self.distance = block_dict['distance'] + self.specials = [] + for i in range(block_dict['specials_len']): + special_key = 'special_' + str(i) + self.specials.append(block_dict[special_key]) + return self + + @property + def distance_str(self) -> str: + return format(self.distance, '.2f') + diff --git a/src/domain/container.py b/src/model/container.py similarity index 86% rename from src/domain/container.py rename to src/model/container.py index d75f05fc75423db3424a8cf340d9f0b4de69532d..8888be9f4493320ac7cce14bde3ed997a5af71c3 100644 --- a/src/domain/container.py +++ b/src/model/container.py @@ -1,6 +1,7 @@ -from src.domain.paragraph import Paragraph +from src.model.paragraph import Paragraph +from src.model.block import Block -INFINITE = 10000 +INFINITE = 99999 class Container: @@ -16,6 +17,7 @@ class Container: self.id_ = int(str(1) + str(father.id_) + str(id_)) if paragraphs: self.paragraphs, self.children = self.create_children(paragraphs, level, rank + 1) + self.blocks = self.get_blocks() @property def text(self): @@ -44,14 +46,18 @@ class Container: text_chunks += child.text_chunks return text_chunks - @property - def blocks(self): - block = {'content': "", 'rank': self.rank, 'level': self.level, 'title': ''} + def get_blocks(self): + block = Block(rank=self.rank, level=self.level) if self.title: - block['title'] = self.title.text + block.title = self.title.text for p in self.paragraphs: - block['content'] += p.text + '. ' - blocks = [block] + if not p.blank: + if p.text.startswith('##### '): + special_action = p.text.lstrip('##### ') + block.specials.append(special_action) + else: + block.content += p.text + '. ' + blocks = [block] if block.content or block.specials else [] for child in self.children: blocks += child.blocks return blocks @@ -67,7 +73,7 @@ class Container: return toc def move(self, position: int, new_father=None): - current_father = self.father # should be added in the domain + current_father = self.father # should be added in the model current_father.children.remove(self) self.rank = new_father.rank + 1 if new_father else 0 diff --git a/src/model/doc.py b/src/model/doc.py new file mode 100644 index 0000000000000000000000000000000000000000..fa7a5ea0539ce8d0e33dcbce96b0136955d17236 --- /dev/null +++ b/src/model/doc.py @@ -0,0 +1,49 @@ +import docx + +from src.model.container import Container +from src.model.paragraph import Paragraph + + +class Doc: + + def __init__(self, path='', id_=None): + + self.xdoc = docx.Document(path) + self.title = path.split('/')[-1] + self.id_ = id(self) + self.path = path + paragraphs = [Paragraph(xp, self.id_, i) for (i, xp) in enumerate(self.xdoc.paragraphs)] + self.container = Container(paragraphs, father=self) + self.blocks = self.get_blocks() + + @property + def structure(self): + + return self.container.structure + + def get_blocks(self): + + def from_list_to_str(index_list): + index_str = str(index_list[0]) + for el in index_list[1:]: + index_str += '.' + str(el) + return index_str + + current_index = [] + blocks = self.container.blocks + for block in blocks: + block.doc = self.title + current_level = len(current_index) + if 0 < block.level: + if block.level == current_level: + current_index[-1] += 1 + elif current_level < block.level: + current_index.append(1) + elif block.level < current_level: + current_index = current_index[:block.level] + current_index[-1] += 1 + block.index = from_list_to_str(current_index) + else: + block.index = "0" + blocks.remove(block) + return blocks diff --git a/src/domain/paragraph.py b/src/model/paragraph.py similarity index 73% rename from src/domain/paragraph.py rename to src/model/paragraph.py index 0f0053f8cb93d2935b6825dcf8b88f7afd4b30a4..b8e30cfbc9ca93979467a314d365a4cd18e2aca3 100644 --- a/src/domain/paragraph.py +++ b/src/model/paragraph.py @@ -1,3 +1,5 @@ +import string + INFINITE = 10000 @@ -25,3 +27,11 @@ class Paragraph: 'level': self.level, }} return structure + + @property + def blank(self): + """ + checks if the paragraph is blank: i.e. it brings some signal (it may otherwise be ignored) + """ + text = self.text.replace('\n', '') + return set(text).isdisjoint(string.ascii_letters) diff --git a/src/tools/__pycache__/llm.cpython-310.pyc b/src/tools/__pycache__/llm.cpython-310.pyc index fd6ccfeed211ae1374cf9c8c11d4b78d254454c4..422a36f18d36098035e1e26d38ea156f0a513f23 100644 Binary files a/src/tools/__pycache__/llm.cpython-310.pyc and b/src/tools/__pycache__/llm.cpython-310.pyc differ diff --git a/src/tools/__pycache__/retriever.cpython-310.pyc b/src/tools/__pycache__/retriever.cpython-310.pyc index 95e5165b27cd356a7cf88d21f9b99e0b879cf64e..b2fa26a3187ffca23008a0342410ad0ad34b48d3 100644 Binary files a/src/tools/__pycache__/retriever.cpython-310.pyc and b/src/tools/__pycache__/retriever.cpython-310.pyc differ diff --git a/src/tools/llm.py b/src/tools/llm.py index 369fe406bee54dfd618c8a6d5f6c87c0495718e7..ec5f223c80f66316138c50525b57c585b81624ea 100644 --- a/src/tools/llm.py +++ b/src/tools/llm.py @@ -1,63 +1,45 @@ -import os +class LlmAgent: -from langchain.llms import OpenAI + def __init__(self, llm): + self.llm = llm -OpenAI_KEY = "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" -os.environ["OPENAI_API_KEY"] = OpenAI_KEY -os.environ["TOKENIZERS_PARALLELISM"] = "true" -openai_llm = OpenAI(temperature=0) + def generate_paragraph(self, query: str, context: {}, language='fr') -> str: + """generates the final answer""" + template = (f" You are an agent designed to answer to the {query} based on the context delimited by triple backticks:\n" + f"``` {context}```\n" + f" The response shall be in {language} and shall be concise and based on the context provided\n" + f" In case the provided context is not relevant to answer to the question, just return that you " + f" don't know the answer ") + p = self.llm(template) + return p -def generate_paragraph(query: str, context: {}, language='fr') -> str: - """generates the final answer""" + def translate(self, text: str, language="en") -> str: + """translates""" - template = (f" Your task consists in generating a response in {language}\\n" - f" to the following query: ```{query}```\n" - f"\n" - f" Documents provided provided below delimited by triple backticks gives you the context: \n" - f" delimited by triple backticks: ``` {context}``` \n" - f" The response shall be concise and factual\n" - ) + languages = "french to english" if language == "en" else "english to french" - temp2 = (f" You are an agent designed to answer to the {query} based on the context delimited by triple backticks:\n" - f"``` {context}```\n" - f" The response shall be in {language} and shall be concise and factual\n" - f" In case the provided context does not seem relevant to answer to the question, just return that you " - f" don't know the answer ") + template = (f" Your task consists in translating {languages}\\n" + f" the following text delimited by by triple backticks: ```{text}```\n" + ) - llm = openai_llm - p = llm(temp2) + p = self.llm(template) + return p - return p + def generate_answer(self, query: str, answer_en: str, context_fr: str) -> str: + """provides the final answer in french based on the initial query and the answer in english""" + def _cut_unfinished_sentence(s: str): + return '.'.join(p.split('.')[:-1]) -def translate(text: str, language="en") -> str: - """translates""" + template = (f"Your task consists in providing the answer in french to the query " + f"delimited by triple backticks: ```{query}``` given the informations here delimited " + f"by triple backticks: ```{context_fr}``` and the answer in english delimited by triple " + f"backticks: ```{answer_en}```" + ) + print(template) + p = self.llm(template) + p = _cut_unfinished_sentence(p) + print(p) + return p - languages = "french to english" if language == "en" else "english to french" - - template = (f" Your task consists in translating {languages}\\n" - f" the following text: ```{text}```\n" - f" delimited by by triple backticks" - ) - - llm = openai_llm - p = llm(template) - - return p - - -def generate_paragraph2(query_and_context: str, language: str = 'fr') -> str: - """generates the final answer""" - - temp2 = (f" You are an agent designed to answer based on the query and content given below and delimited by triple" - f" backticks:\n" - f"``` {query_and_context}```\n" - f" The response shall be in {language} and shall be concise and factual\n" - f" In case the provided context does not seem relevant to answer to the question, just return that you " - f" don't know the answer ") - - llm = openai_llm - p = llm(temp2) - - return p diff --git a/src/tools/retriever.py b/src/tools/retriever.py index ea089761ddedcc17da8a14f986fdb8f3977c35fd..3b78784c30ecbcc7df90d18499275455632c227c 100644 --- a/src/tools/retriever.py +++ b/src/tools/retriever.py @@ -1,29 +1,31 @@ - - -TOKENIZERS_PARALLELISM = True - - -def init_collections(vs_client, doc_path, doc_content, collection_name): - plan_blocks = doc_path.blocks - content_blocks = doc_content.blocks - collection_ = vs_client.create_collection(name=collection_name) - collection_.add( - documents=[b['content'] for b in plan_blocks], - ids=[b['paragraph'] for b in plan_blocks], - metadatas=content_blocks - ) - return collection_ - - -def similarity_search(collection: object, query: str) -> {}: - res = collection.query(query_texts=query) - sources = res['metadatas'][0] - distances = res['distances'][0] - for s, d in zip(sources, distances): - s['distance_f'] = d - s['distance'] = format(d, '.2f') - return sources - - - - +from src.model.doc import Doc +from src.model.block import Block + + +class Retriever: + + def __init__(self, db_client, plan_doc: Doc, content_doc: Doc, content_fr_doc: Doc, collection_name: str): + plan_blocks: [Block] = plan_doc.blocks + content_blocks: [Block] = content_doc.blocks + content_fr_blocks: [Block] = content_fr_doc.blocks + for pb, cb in zip(plan_blocks, content_blocks): + cb.specials = pb.specials + for cb, cb_fr in zip(content_blocks, content_fr_blocks): + cb.content_fr = cb_fr.content + self.collection = db_client.create_collection(name=collection_name) + self.collection.add( + documents=[block.content for block in plan_blocks], + ids=[block.index for block in plan_blocks], + metadatas=[block.to_dict() for block in content_blocks] + ) + + def similarity_search(self, query: str) -> {}: + res = self.collection.query(query_texts=query) + block_dict_sources = res['metadatas'][0] + distances = res['distances'][0] + blocks = [] + for bd, d in zip(block_dict_sources, distances): + b = Block().from_dict(bd) + b.distance = d + blocks.append(b) + return blocks diff --git a/src/view/__pycache__/view.cpython-310.pyc b/src/view/__pycache__/view.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2627c09f3e247b6abb2eb4b0d247e70bcb60a7c5 Binary files /dev/null and b/src/view/__pycache__/view.cpython-310.pyc differ diff --git a/src/view/view.py b/src/view/view.py new file mode 100644 index 0000000000000000000000000000000000000000..e828e4081ca1ba1f9dacd1c9914be3541fda78af --- /dev/null +++ b/src/view/view.py @@ -0,0 +1,74 @@ +import gradio as gr +from src.control.control import Controller + + +def run(ctrl: Controller, examples: {}): + with gr.Blocks() as qna: + with gr.Row(): + with gr.Column(): + pass + + with gr.Column(scale=10): + """ + 1. input docs components + """ + + gr.Markdown("# Questions sur le vivre ensemble en entreprise") + + input_text_comp = gr.Textbox( + label="", + lines=1, + max_lines=3, + interactive=True, + placeholder="Posez votre question ici", + ) + input_example_comp = gr.Radio( + label="Examples de questions", + choices=list(examples.keys()), + ) + output_text_comp = gr.Textbox( + label="La réponse automatique", + lines=12, + max_lines=12, + interactive=False, + visible=False, + ) + sources_comp = gr.CheckboxGroup( + label="Documents sources", + visible=False, + interactive=False, + ) + + with gr.Column(): + pass + + def input_text_fn1(): + update_ = { + output_text_comp: gr.update(visible=True), + } + return update_ + + def input_text_fn2(input_text_): + answer, sources = ctrl.get_response(query_fr=input_text_) + source_labels = [s.distance_str + ' ' + s.index + ' ' + s.title + ' from ' + s.doc for s in sources] + update_ = { + output_text_comp: gr.update(value=answer), + sources_comp: gr.update(visible=True, choices=source_labels, value=source_labels) + } + return update_ + + def input_example_fn(input_example_): + update_ = { + input_text_comp: gr.update(value=examples[input_example_]), + output_text_comp: gr.update(visible=True), + } + return update_ + + input_text_comp \ + .submit(input_text_fn1, inputs=[], outputs=[output_text_comp]) \ + .then(input_text_fn2, inputs=[input_text_comp], outputs=[output_text_comp, sources_comp]) + input_example_comp \ + .change(input_example_fn, inputs=[input_example_comp], outputs=[input_text_comp, output_text_comp]) \ + .then(input_text_fn2, inputs=[input_text_comp], outputs=[output_text_comp, sources_comp]) + + return qna