diff --git a/data/.DS_Store b/data/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/data/.DS_Store differ diff --git a/data/AccomodationAndMealsForfaits_en.csv b/data/AccomodationAndMealsForfaits_en.csv new file mode 100644 index 0000000000000000000000000000000000000000..e9499ec401c3eb01e2c72798a42621745a805c5e Binary files /dev/null and b/data/AccomodationAndMealsForfaits_en.csv differ diff --git a/data/AccomodationAndMealsForfaits_en.numbers b/data/AccomodationAndMealsForfaits_en.numbers new file mode 100755 index 0000000000000000000000000000000000000000..dc1284a45b900a6b1671ed70d588c363b2b521fb Binary files /dev/null and b/data/AccomodationAndMealsForfaits_en.numbers differ diff --git a/data/AccomodationAndMealsForfaits_fr.csv b/data/AccomodationAndMealsForfaits_fr.csv new file mode 100644 index 0000000000000000000000000000000000000000..8b71a64f48ab37711f5a9f8ec7d2867f5c0d9d35 --- /dev/null +++ b/data/AccomodationAndMealsForfaits_fr.csv @@ -0,0 +1,31 @@ +Destination;Hebergement;Repas +France;125;27 +Allemagne;150;35 +Arabie Saoudite;200;40 +Autriche;110;40 +Belgique;150;35 +Canada;150;30 +Chine;113;37 +Egypte;150;25 +Emirats Arabes Unis;160;46 +Espagne;130;30 +Etats-Unis;140;47 +GrŹce;140;25 +Inde;160;47 +Irlande;180;30 +Italie;120;37 +Japon;150;25 +Maroc;110;25 +Mexique;130;27 +NorvŹge;160;40 +Pays-Bas;150;32 +Pologne;110;23 +Portugal;108;25 +Qatar;210;35 +Royaume-Uni;130;28 +Russie;180;50 +Singapour;170;42 +SuŹde;90;30 +Suisse;192;35 +Taiwan;123;37 +Turquie;150;28 \ No newline at end of file diff --git a/data/AccomodationAndMealsForfaits_fr.numbers b/data/AccomodationAndMealsForfaits_fr.numbers new file mode 100755 index 0000000000000000000000000000000000000000..5269dd5685195ad2731beccaf11b68ec1612fd36 Binary files /dev/null and b/data/AccomodationAndMealsForfaits_fr.numbers differ diff --git a/data/BaremeTauxEloignement.csv b/data/BaremeTauxEloignement.csv new 
file mode 100644 index 0000000000000000000000000000000000000000..e03e92d3f006acf8ebf53734650d76dbddf81b65 --- /dev/null +++ b/data/BaremeTauxEloignement.csv @@ -0,0 +1,84 @@ +Tableau 1 +Barème Taux d’Éloignement; +Afrique du sud;10 % +AlgĂ©rie;15 % +Allemagne;0 % +Arabie saoudite;12 % +Argentine;11 % +Australie;3 % +Autriche;0 % +Belgique;0 % +Bolivie;11 % +BrĂ©sil;11 % +Bulgarie;10 % +Cameroun;13 % +Canada;3 % +Chili;9 % +Chine;13 % +Chypre;4 % +Colombie;13 % +CorĂ©e;11 % +Croatie;7 % +Danemark;0 % +Djibouti;13 % +E.A.U;9 % +Egypte;16 % +Equateur;12 % +Espagne;0 % +Estonie;7 % +Etats unis;3 % +Ethiopie;12 % +Finlande;0 % +Grande Bretagne;0 % +Grèce;0 % +Guadeloupe;3 % +Guyane;7 % +Hong Kong;8 % +Hongrie;6 % +Ile Maurice;8 % +Inde;15 % +IndonĂ©sie;17 % +Irlande;0 % +IsraĂ«l;9 % +Italie;0 % +Japon;8 % +Jordanie;10 % +Kenya;13 % +KoweĂŻt;11 % +Laos;13 % +Luxembourg;0 % +Madagascar;13 % +Malaisie;14 % +Maroc;8 % +Martinique;3 % +Mauritanie;10 % +Mexique;12 % +Mozambique;14 % +Nigeria;17 % +Norvège;0 % +Nouvelle CalĂ©donie;4 % +Pakistan;17 % +PĂ©rou;13 % +Philippines;16 % +Pologne;8 % +PolynĂ©sie;5 % +Portugal;0 % +Qatar;9 % +RĂ©publique Congo;14 % +RĂ©publique tchèque;6 % +Roumanie;11 % +Russie;13 % +SĂ©nĂ©gal;10 % +Serbie;11 % +Singapour;6 % +Slovaquie;6 % +Sri Lanka;15 % +Suède;0 % +Suisse;0 % +Taiwan;11 % +ThaĂŻlande;12 % +Tunisie;7 % +Turquie;10 % +Ukraine;12 % +Venezuela;13 % +Vietnam;13 % \ No newline at end of file diff --git a/data/DeplacementsEtVoyages.docx b/data/DeplacementsEtVoyages.docx new file mode 100644 index 0000000000000000000000000000000000000000..181cb8e00142b59479fab2a81275a966489370c2 Binary files /dev/null and b/data/DeplacementsEtVoyages.docx differ diff --git a/data/DeplacementsEtVoyagesRev.docx b/data/DeplacementsEtVoyagesRev.docx new file mode 100644 index 0000000000000000000000000000000000000000..84944fcd81933a7f377b04bf6f7f3ac863e21a3a Binary files /dev/null and b/data/DeplacementsEtVoyagesRev.docx differ diff --git 
a/data/ForfaitsRemboursements.csv b/data/ForfaitsRemboursements.csv new file mode 100644 index 0000000000000000000000000000000000000000..921485d633c352ab252f70c9ea8954c384e1fca3 --- /dev/null +++ b/data/ForfaitsRemboursements.csv @@ -0,0 +1,31 @@ +Destination;;HĂ©bergement;;Repas +France;;IDF 125€ / Province 100€;;27 € +Allemagne;;150 € ;;35 € +Arabie Saoudite;;200 € ;;40 € +Autriche;;110 € ;;40 € +Belgique;;150 € ;;35 € +Canada;;150 € ;;30 € +Chine;;113 € ;;37 € +Egypte;;150 € ;;25 € +Emirats Arabes Unis;;160 € ;;46 € +Espagne;;130 € ;;30 € +Etats-Unis;;140 € ;;47 € +Grèce;;140 € ;;25 € +Inde;;160 € ;;47 € +Irlande;;180 € ;;30 € +Italie;;120 € ;;37 € +Japon;;150 € ;;25 € +Maroc;;110 € ;;25 € +Mexique;;130 € ;;27 € +Norvège;;160 € ;;40 € +Pays-Bas;;150 € ;;32 € +Pologne;;110 € ;;23 € +Portugal;;108 € ;;25 € +Qatar;;210 € ;;35 € +Royaume-Uni;;130 € ;;28 € +Russie;;180 € ;;50 € +Singapour;;170 € ;;42 € +Suède;;90 € ;;30 € +Suisse;;192 € ;;35 € +Taiwan;;123 € ;;37 € +Turquie;;150 € ;;28 € \ No newline at end of file diff --git a/data/NonPrisEnCharge.csv b/data/NonPrisEnCharge.csv new file mode 100644 index 0000000000000000000000000000000000000000..a1f8f4f34a2c7ac4dc7a71e3c069c02d05208fd6 --- /dev/null +++ b/data/NonPrisEnCharge.csv @@ -0,0 +1,14 @@ +Non pris en charge via Note de Frais +"MatĂ©riel informatique : tĂ©lĂ©phone, chargeur, tablette, adaptateur prise, etc." +"Outillage : balai, tournevis, disque de disqueuse, etc." +"Mobilier/AmĂ©nagement de bureau : plantes, dalles, poufs, etc." +"Fournitures de bureau : cafĂ©, piles, etc." +ConfĂ©rence/Cotisation +SĂ©minaire/RĂ©union Team Building +Doublon de clĂ©s +Equipement de Protection Individuelle (EPI) +Achat de bagagerie : neuf/perdu/endommagĂ© +Lavage et recharge Carte de Lavage : tous types de vĂ©hicule +Consommation alcoolisĂ©e +"Collation : confiserie, gâteau, boisson, etc." +"Prestation de loisirs : Spa, piscine, massage, remontĂ©e mĂ©canique, escape game, etc." 
\ No newline at end of file diff --git a/data/business_trips_content_en.docx b/data/business_trips_content_en.docx new file mode 100644 index 0000000000000000000000000000000000000000..c4e026068990cec5ffbff745b26d46d8fcb8c8bf Binary files /dev/null and b/data/business_trips_content_en.docx differ diff --git a/data/business_trips_content_fr.docx b/data/business_trips_content_fr.docx new file mode 100644 index 0000000000000000000000000000000000000000..68da3f932c463366391858706e7bffe5c5de4b64 Binary files /dev/null and b/data/business_trips_content_fr.docx differ diff --git a/data/business_trips_content_until_3_en.docx b/data/business_trips_content_until_3_en.docx new file mode 100644 index 0000000000000000000000000000000000000000..ff314c0a40d3875042473fd7a3405533f32a9dff Binary files /dev/null and b/data/business_trips_content_until_3_en.docx differ diff --git a/data/business_trips_content_until_3_enfr.docx b/data/business_trips_content_until_3_enfr.docx new file mode 100644 index 0000000000000000000000000000000000000000..849b7f3f3655e252f1d712f2d7a68e215e36aa5a Binary files /dev/null and b/data/business_trips_content_until_3_enfr.docx differ diff --git a/data/business_trips_content_until_3_fr.docx b/data/business_trips_content_until_3_fr.docx new file mode 100644 index 0000000000000000000000000000000000000000..4c4c5e99f7f59973133a7bcec232492defa8382a Binary files /dev/null and b/data/business_trips_content_until_3_fr.docx differ diff --git a/data/business_trips_content_until_9_en.docx b/data/business_trips_content_until_9_en.docx new file mode 100644 index 0000000000000000000000000000000000000000..1d09f518827a373d64888667d504ae41c1940198 Binary files /dev/null and b/data/business_trips_content_until_9_en.docx differ diff --git a/data/business_trips_plan_en.docx b/data/business_trips_plan_en.docx new file mode 100644 index 0000000000000000000000000000000000000000..2fb08997e8264ca0975e2a4ff8918c6cef6e908c Binary files /dev/null and b/data/business_trips_plan_en.docx 
differ diff --git a/data/business_trips_plan_until_3_en.docx b/data/business_trips_plan_until_3_en.docx new file mode 100644 index 0000000000000000000000000000000000000000..478c5570933cd939077ddfec2466f7c221a13149 Binary files /dev/null and b/data/business_trips_plan_until_3_en.docx differ diff --git a/data/business_trips_plan_until_3_fr.docx b/data/business_trips_plan_until_3_fr.docx new file mode 100644 index 0000000000000000000000000000000000000000..7a7c12cfee53a64d2051cfb77733d1ab2a4f2f16 Binary files /dev/null and b/data/business_trips_plan_until_3_fr.docx differ diff --git a/data/business_trips_plan_until_9_en.docx b/data/business_trips_plan_until_9_en.docx new file mode 100644 index 0000000000000000000000000000000000000000..186f4577e98856cfc8e32595ad212f3b4e3fe99a Binary files /dev/null and b/data/business_trips_plan_until_9_en.docx differ diff --git a/data/transports.docx b/data/transports.docx new file mode 100644 index 0000000000000000000000000000000000000000..4c217bcc1f088225f296d0eb14eb11dad4700a44 Binary files /dev/null and b/data/transports.docx differ diff --git a/data/transports_content_en.docx b/data/transports_content_en.docx new file mode 100644 index 0000000000000000000000000000000000000000..c99dfe94b47ff9fede7362e30550427126afdf2d Binary files /dev/null and b/data/transports_content_en.docx differ diff --git a/data/transports_content_fr.docx b/data/transports_content_fr.docx new file mode 100644 index 0000000000000000000000000000000000000000..19d685875054e5a425c7a5aacc3d402d9cfad683 Binary files /dev/null and b/data/transports_content_fr.docx differ diff --git a/data/transports_plan.docx b/data/transports_plan.docx new file mode 100644 index 0000000000000000000000000000000000000000..8172c4ff2a5f3ecc8196c1e408afaeaa78ca55b2 Binary files /dev/null and b/data/transports_plan.docx differ diff --git a/data/transports_plan_en.docx b/data/transports_plan_en.docx new file mode 100644 index 
0000000000000000000000000000000000000000..5905b200d041f9b0867624d89ab533c659b939ed Binary files /dev/null and b/data/transports_plan_en.docx differ diff --git a/data/transports_plan_short_en.docx b/data/transports_plan_short_en.docx new file mode 100644 index 0000000000000000000000000000000000000000..5ba6548062de9c9de67c5096ca2411f59ba42214 Binary files /dev/null and b/data/transports_plan_short_en.docx differ diff --git a/data/transports_plan_short_fr.docx b/data/transports_plan_short_fr.docx new file mode 100644 index 0000000000000000000000000000000000000000..c8b6097ff8e4c096e9e0f67e311f81c882da5cd3 Binary files /dev/null and b/data/transports_plan_short_fr.docx differ diff --git a/data/~$ansports.docx b/data/~$ansports.docx new file mode 100644 index 0000000000000000000000000000000000000000..1215360c558324f168a348f977b46d5a160ee437 Binary files /dev/null and b/data/~$ansports.docx differ diff --git a/data/~$ansports_contenu.txt b/data/~$ansports_contenu.txt new file mode 100644 index 0000000000000000000000000000000000000000..eda54b1681d013d66d5b5ea588fe54f213436b9a Binary files /dev/null and b/data/~$ansports_contenu.txt differ diff --git a/data/~$placementsEtVoyages.docx b/data/~$placementsEtVoyages.docx new file mode 100644 index 0000000000000000000000000000000000000000..1215360c558324f168a348f977b46d5a160ee437 Binary files /dev/null and b/data/~$placementsEtVoyages.docx differ diff --git a/data/~$siness_trip_plan_until_3_fr.docx b/data/~$siness_trip_plan_until_3_fr.docx new file mode 100644 index 0000000000000000000000000000000000000000..3a571977a55490f9911525c3c8debdadd32e9b95 Binary files /dev/null and b/data/~$siness_trip_plan_until_3_fr.docx differ diff --git a/data/~$siness_trips_content_until_3_fr.docx b/data/~$siness_trips_content_until_3_fr.docx new file mode 100644 index 0000000000000000000000000000000000000000..3a571977a55490f9911525c3c8debdadd32e9b95 Binary files /dev/null and b/data/~$siness_trips_content_until_3_fr.docx differ diff --git 
a/data/~$siness_trips_content_until_9_en.docx b/data/~$siness_trips_content_until_9_en.docx new file mode 100644 index 0000000000000000000000000000000000000000..40c3d5c3d1afe4ef002c13f20160b0e1d8fc96f3 Binary files /dev/null and b/data/~$siness_trips_content_until_9_en.docx differ diff --git a/data/~$siness_trips_plan_until_9_en.docx b/data/~$siness_trips_plan_until_9_en.docx new file mode 100644 index 0000000000000000000000000000000000000000..628830311cd684d862b7bebd29f7866e94ca1af9 Binary files /dev/null and b/data/~$siness_trips_plan_until_9_en.docx differ diff --git a/src/__pycache__/control.cpython-310.pyc b/src/__pycache__/control.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4731bb228d5b9749f004a804981477f5ad64752b Binary files /dev/null and b/src/__pycache__/control.cpython-310.pyc differ diff --git a/src/app.py b/src/app.py new file mode 100644 index 0000000000000000000000000000000000000000..ca177a70a0882e766505918a97d633f698e2e97b --- /dev/null +++ b/src/app.py @@ -0,0 +1,91 @@ +import gradio as gr + + +import src.control as ctrl + + +""" +================================== +A. Component part +================================== +""" + +with gr.Blocks() as hrqa: + + with gr.Row(): + + with gr.Column(): + pass + + with gr.Column(scale=10): + """ + 1. 
input docs components + """ + + gr.Markdown("# Questions sur le vivre ensemble en entreprise") + + input_text_comp = gr.Textbox( + label="", + lines=1, + max_lines=3, + interactive=True, + placeholder="Posez votre question ici", + ) + input_example_comp = gr.Radio( + label="Examples de questions", + choices=["Remboursement de frais de voiture", "Recommandations de transport"], + ) + output_text_comp = gr.Textbox( + label="La rĂ©ponse automatique", + lines=2, + max_lines=10, + interactive=False, + visible=False, + ) + sources_comp = gr.CheckboxGroup( + label="Documents sources", + visible=False, + interactive=False, + ) + + with gr.Column(): + pass + + + def input_text_fn1(): + update_ = { + output_text_comp: gr.update(visible=True), + } + return update_ + + def input_text_fn2(input_text_): + answer, sources = ctrl.get_response(query=input_text_) + source_labels = [s['distance']+' '+s['paragraph']+' '+s['title']+' from '+s['doc'] for s in sources] + update_ = { + output_text_comp: gr.update(value=answer), + sources_comp: gr.update(visible=True, choices=source_labels, value=source_labels) + } + return update_ + + + def input_example_fn(input_example_): + examples = { + "Remboursement de frais de voiture": "Comment sont remboursĂ©s mes frais kilomĂ©triques sur mes trajets " + "professionnels?", + "Recommandations de transport": "Quelles sont les recommandations de l'entreprise? Vaut-il mieux voyager en " + "train ou en avion?" 
+ } + update_ = { + input_text_comp: gr.update(value=examples[input_example_]), + output_text_comp: gr.update(visible=True), + } + return update_ + + input_text_comp\ + .submit(input_text_fn1, inputs=[], outputs=[output_text_comp])\ + .then(input_text_fn2, inputs=[input_text_comp], outputs=[output_text_comp, sources_comp]) + input_example_comp\ + .change(input_example_fn, inputs=[input_example_comp], outputs=[input_text_comp, output_text_comp])\ + .then(input_text_fn2, inputs=[input_text_comp], outputs=[output_text_comp, sources_comp]) + +hrqa.queue().launch() \ No newline at end of file diff --git a/src/app2.py b/src/app2.py new file mode 100644 index 0000000000000000000000000000000000000000..b8f6c5fd78eb1c1d93eacfd4872f35bcf5374624 --- /dev/null +++ b/src/app2.py @@ -0,0 +1,16 @@ +from langchain.agents import create_csv_agent +from langchain.agents import create_pandas_dataframe_agent +import src.tools.llm as llm + +import pandas as pd + +path = '../data/AccomodationAndMealsForfaits_en.csv' +#path = '../data/test_utf32.csv' +df = pd.read_csv(path, encoding='utf32', sep=";") +agent = create_pandas_dataframe_agent(llm.OpenAI(temperature=0), df, verbose=True) +refund = agent.run("Quel est le remboursement pour un repas en Turkiye?") +print(refund) + + + +pass diff --git a/src/control.py b/src/control.py new file mode 100644 index 0000000000000000000000000000000000000000..2a6fcebfd8ea636f17a5f35914e0d92baa111751 --- /dev/null +++ b/src/control.py @@ -0,0 +1,49 @@ +import chromadb + +import src.tools.retriever as rtrvr +import src.tools.llm as llm +from src.domain.doc import Doc + +chroma_client = chromadb.Client() + +plan_language = 'en' +content_language = 'en' +path_plan = '../data/business_trips_plan_until_9_en.docx' +path_content = '../data/business_trips_content_until_9_en.docx' +collection_name = "until_9" + +doc_plan = Doc(path_plan) +doc_content = Doc(path_content) +collection_ = rtrvr.init_collections(chroma_client, doc_plan, doc_content, collection_name) + 
def select_best_sources(sources: [], delta_1_2=0.1, delta_1_n=0.25, absolute=1.1) -> []:
    """Filter retrieval hits down to the ones worth using as context.

    The first hit is always kept. Any later hit is kept when either
      * its ``'distance_f'`` exceeds the previous hit's by less than
        ``delta_1_2`` and exceeds the first hit's by less than ``delta_1_n``, or
      * its ``'distance_f'`` is below ``absolute``.

    :param sources: sequence of dicts, each carrying a numeric ``'distance_f'``
    :param delta_1_2: maximum allowed gap to the immediately preceding hit
    :param delta_1_n: maximum allowed gap to the first hit
    :param absolute: distance under which a hit is kept regardless of gaps
    :return: new list with the retained hits, in their original order
    """

    def _is_kept(position, hit):
        if position == 0:
            return True
        distance = hit['distance_f']
        # Relative test first, then the absolute threshold (same order as before).
        near_previous = distance - sources[position - 1]['distance_f'] < delta_1_2
        if near_previous and distance - sources[0]['distance_f'] < delta_1_n:
            return True
        return distance < absolute

    return [hit for position, hit in enumerate(sources) if _is_kept(position, hit)]
+ """ + query = llm.translate(query) + sources = rtrvr.similarity_search(collection=collection_, query=query) + sources = select_best_sources(sources) + sources_contents = [s['content'] for s in sources] + context = '\n'.join(sources_contents) + return context + + +@tool +def generate_answer(query_and_context: str) -> str: + """ + useful for when you have a query and the relevant content to generate an answer + """ + answer = llm.generate_paragraph2(query_and_context=query_and_context, language='en') + answer = llm.translate(text=answer, language='fr') + return answer.lstrip() + + +tools = [similarity_search, generate_answer] + +agent = initialize_agent(tools, llm.openai_llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True) +q1 = "Comment sont remboursĂ©s mes frais kilomĂ©triques sur mes dĂ©placements avec mon vĂ©hicule personnel?" +q2 = "Quels sont les moyens de transport recommandĂ©s par la sociĂ©tĂ©?" +ans = agent.run(q2) diff --git a/src/domain/__pycache__/container.cpython-310.pyc b/src/domain/__pycache__/container.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a86fb7609c303509fddda7a8e9d5c7cd6989244 Binary files /dev/null and b/src/domain/__pycache__/container.cpython-310.pyc differ diff --git a/src/domain/__pycache__/doc.cpython-310.pyc b/src/domain/__pycache__/doc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df468f912a760375e636ee2c47dad438a92f58ac Binary files /dev/null and b/src/domain/__pycache__/doc.cpython-310.pyc differ diff --git a/src/domain/__pycache__/paragraph.cpython-310.pyc b/src/domain/__pycache__/paragraph.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..271387aeb3c6d23f54200de8bf2e25c50404b911 Binary files /dev/null and b/src/domain/__pycache__/paragraph.cpython-310.pyc differ diff --git a/src/domain/__pycache__/style.cpython-310.pyc b/src/domain/__pycache__/style.cpython-310.pyc new file mode 100644 index 
from typing import TYPE_CHECKING

if TYPE_CHECKING:  # used in annotations only, so no runtime import is required
    from src.domain.paragraph import Paragraph

INFINITE = 10000  # starting "no heading seen yet" level used while grouping paragraphs


class Container:
    """One node of a document outline: an optional heading plus what sits under it.

    A container owns the plain paragraphs directly attached to it
    (``paragraphs``) and the sub-containers opened by the headings found
    among the paragraphs it was given (``children``).
    """

    def __init__(self, paragraphs: "list[Paragraph]", title: "Paragraph | None" = None, level: int = 0,
                 rank: int = 0, father=None, id_=0):
        """Build the container and, recursively, its children.

        :param paragraphs: paragraphs belonging to this container, in document order
        :param title: heading paragraph that opens the container (None for a root)
        :param level: heading level of ``title``
        :param rank: depth of the container in the tree
        :param father: enclosing object exposing ``id_``, or None at the top of the hierarchy
        :param id_: index of this container among its siblings
        """
        self.level = level
        self.title = title
        self.paragraphs = []
        self.children = []
        self.rank = rank
        self.father = father  # if not father, then the container is at the top of the hierarchy
        # The id concatenates the digit 1, the father's id and the sibling index.
        # A father-less container used to crash on ``father.id_``; it now uses 0.
        father_id = father.id_ if father is not None else 0
        self.id_ = int(str(1) + str(father_id) + str(id_))
        if paragraphs:
            self.paragraphs, self.children = self.create_children(paragraphs, level, rank + 1)

    @property
    def text(self):
        """Full text of the subtree: title line, own paragraphs, then every child."""
        parts = []
        if self.title:
            parts.append("Titre " + str(self.level) + " : " + self.title.text + '\n')
        parts.extend(p.text + '\n' for p in self.paragraphs)
        parts.extend(child.text for child in self.children)
        return "".join(parts)

    @property
    def text_chunks(self, chunk=500):
        """Paragraph text of the subtree grouped into chunks of roughly ``chunk`` characters.

        Paragraphs are never split: a paragraph that would overflow the current
        chunk starts a new one, and a paragraph longer than ``chunk`` forms a
        chunk of its own. Because this is a property, ``chunk`` always takes
        its default value.
        """
        text_chunks = []
        text_chunk = ""
        for p in self.paragraphs:
            if chunk < len(text_chunk) + len(p.text):
                # Flush the current chunk, skipping empty/blank ones ...
                if text_chunk and not text_chunk.isspace():
                    text_chunks.append(text_chunk)
                text_chunk = ""
            # ... and keep the paragraph that caused the flush (it used to be dropped).
            text_chunk += " " + p.text
        if text_chunk and not text_chunk.isspace():
            text_chunks.append(text_chunk)
        for child in self.children:
            text_chunks += child.text_chunks
        return text_chunks

    @property
    def blocks(self):
        """One dict per container of the subtree (self first, then descendants).

        Each dict has the keys ``content`` (own paragraphs joined with '. '),
        ``rank``, ``level`` and ``title`` ('' when the container has no title).
        """
        block = {'content': "", 'rank': self.rank, 'level': self.level, 'title': ''}
        if self.title:
            block['title'] = self.title.text
        block['content'] += "".join(p.text + '. ' for p in self.paragraphs)
        blocks = [block]
        for child in self.children:
            blocks += child.blocks
        return blocks

    @property
    def table_of_contents(self):
        """Flat list of ``{str(level): title_text}`` entries for the subtree, in order."""
        toc = []
        if self.title:
            toc.append({str(self.level): self.title.text})
        for child in self.children:
            toc += child.table_of_contents
        return toc

    def move(self, position: int, new_father=None):
        """Detach the container from its father and attach it under ``new_father``.

        :param position: index among ``new_father.children``; past-the-end appends
        :param new_father: receiving container, or None to leave this one detached
            with rank 0 (this case used to raise AttributeError)
        """
        # The father may be None or an object without ``children``; then there
        # is nothing to detach from.
        siblings = getattr(self.father, 'children', None)
        if siblings is not None:
            siblings.remove(self)

        self.rank = new_father.rank + 1 if new_father else 0
        self.father = new_father
        if new_father is None:
            return
        # list.insert appends when the index is past the end.
        new_father.children.insert(position, self)

    def create_children(self, paragraphs, level, rank) -> "tuple[list, list]":
        """Split ``paragraphs`` into directly attached paragraphs and child containers.

        Paragraphs seen before the first heading are attached to this
        container. From the first heading on, each heading whose level is less
        than or equal to the current group's level opens a new child; everything
        else, deeper headings included, goes to the open child, which
        sub-divides it recursively.

        :param paragraphs: paragraphs to distribute (left unmodified)
        :param level: accepted for interface compatibility; grouping always
            starts from INFINITE, as it did before
        :param rank: rank given to the created children
        :return: ``(attached_paragraphs, children)``
        """
        attached_paragraphs = []
        children = []
        pending_paragraphs = []
        pending_title = None
        current_level = INFINITE
        in_children = False
        child_id = 0

        def flush():
            """Turn the pending title and paragraphs into a child, if there are any."""
            nonlocal child_id
            if pending_paragraphs or pending_title:
                children.append(Container(pending_paragraphs, pending_title, current_level, rank, self, child_id))
                child_id += 1

        for p in paragraphs:
            if not in_children and not p.is_structure:
                attached_paragraphs.append(p)
                continue
            in_children = True
            if p.is_structure and p.level <= current_level:  # p is higher or equal in hierarchy
                flush()
                pending_paragraphs = []
                pending_title = p
                current_level = p.level
            else:  # p is strictly lower in hierarchy
                pending_paragraphs.append(p)

        flush()
        return attached_paragraphs, children

    @property
    def structure(self):
        """Flat list of single-key dicts describing the subtree.

        The first entry describes this container, keyed by ``str(self.id_)``.
        It is followed by the ``structure`` of each attached paragraph and then
        by the entries of every child container.
        """
        self_structure = {str(self.id_): {
            'index': str(self.id_),
            'canMove': True,
            'isFolder': True,
            'children': [p.id_ for p in self.paragraphs] + [child.id_ for child in self.children],
            'canRename': True,
            'data': {},
            'level': self.level,
            'rank': self.rank,
            'title': self.title.text if self.title else 'root'
        }}
        structure = [self_structure] + [p.structure for p in self.paragraphs]
        for child in self.children:
            structure += child.structure
        return structure
INFINITE = 10000  # level given to paragraphs that are not headings


class Paragraph:
    """Wrap a source paragraph object and expose its outline information.

    Attributes:
        xparagraph: the wrapped paragraph (must expose ``style.name`` and ``text``).
        id_: integer formed by concatenating the digit 2, ``doc_id`` and ``id_``.
        level: heading level parsed from the style name, or INFINITE.
        is_structure: True when the paragraph is a heading (``level < INFINITE``).
        text: the paragraph's text.
    """

    def __init__(self, xparagraph, doc_id: int, id_: int):
        """
        :param xparagraph: paragraph object to wrap
        :param doc_id: id of the owning document
        :param id_: index of the paragraph inside the document
        """
        self.xparagraph = xparagraph
        self.id_ = int(str(2) + str(doc_id) + str(id_))
        self.level = self._heading_level(self.xparagraph.style.name)
        self.is_structure = self.level < INFINITE
        self.text = self.xparagraph.text

    @staticmethod
    def _heading_level(style_name: str) -> int:
        """Return the number ending a 'Heading N' style name, else INFINITE.

        Style names that contain 'Heading' but do not end in an integer
        (e.g. 'TOC Heading') used to raise ValueError; they are now treated
        as ordinary, non-structural paragraphs.
        """
        if 'Heading' not in style_name:
            return INFINITE
        try:
            return int(style_name.split(' ')[-1])
        except ValueError:
            return INFINITE

    @property
    def structure(self):
        """Single-key dict describing the paragraph as a leaf, keyed by ``str(self.id_)``."""
        structure = {str(self.id_): {
            'index': str(self.id_),
            'canMove': True,
            'isFolder': False,
            'children': [],
            'title': self.text,
            'canRename': True,
            'data': {},
            'level': self.level,
        }}
        return structure
from docx.enum.style import WD_STYLE_TYPE


class Style:
    """Pair a document style object with an id derived from its document.

    Attributes:
        id_: integer formed by concatenating ``doc_id`` and ``id_``.
        xstyle: the wrapped style object.
    """

    def __init__(self, xstyle, doc_id, id_):
        """
        :param xstyle: style object to wrap
        :param doc_id: id of the owning document
        :param id_: index of the style inside the document
        """
        self.id_ = int("".join((str(doc_id), str(id_))))
        self.xstyle = xstyle

    def copy_from(self, xref):  # need to be further developed
        """Copy font and style-level settings from ``xref`` onto the wrapped style.

        Nothing is copied unless ``xref.type`` is ``WD_STYLE_TYPE.PARAGRAPH``.
        The ``builtin`` and ``type`` attributes are not copied.

        NOTE(review): the flattened source does not show where the type guard
        ends; this version assumes it covers the style-level assignments as
        well as the font ones — confirm against the original file.

        :param xref: reference style whose settings are read
        """
        if xref.type != WD_STYLE_TYPE.PARAGRAPH:
            return

        # Font settings, assigned in this exact order.
        self.xstyle.font.size = xref.font.size
        self.xstyle.font.color.rgb = xref.font.color.rgb
        font_fields = (
            'name',
            'all_caps',
            'bold',
            'complex_script',
            'cs_bold',
            'cs_italic',
            'double_strike',
            'emboss',
            'hidden',
            'highlight_color',
            'imprint',
            'italic',
            'math',
            'no_proof',
            'outline',
            'rtl',
            'shadow',
            'small_caps',
            'snap_to_grid',
            'spec_vanish',
            'strike',
            'subscript',
            'superscript',
            'underline',
            'web_hidden',
        )
        for field in font_fields:
            value = getattr(xref.font, field)
            setattr(self.xstyle.font, field, value)

        # Settings held by the style object itself.
        style_fields = (
            'base_style',
            'hidden',
            'locked',
            'name',
            'priority',
            'quick_style',
            'unhide_when_used',
        )
        for field in style_fields:
            value = getattr(xref, field)
            setattr(self.xstyle, field, value)
+
diff --git a/src/domain/user.py b/src/domain/user.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9ca1d9aa6448c0c3e8d57bff0fdb4fe828a6d52
--- /dev/null
+++ b/src/domain/user.py
@@ -0,0 +1,5 @@
+class User:
+    """Domain object that stores the name of a user."""
+
+    def __init__(self, username):
+        self.name = username
diff --git a/src/tools/__pycache__/llm.cpython-310.pyc b/src/tools/__pycache__/llm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd6ccfeed211ae1374cf9c8c11d4b78d254454c4
Binary files /dev/null and b/src/tools/__pycache__/llm.cpython-310.pyc differ
diff --git a/src/tools/__pycache__/retriever.cpython-310.pyc b/src/tools/__pycache__/retriever.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..95e5165b27cd356a7cf88d21f9b99e0b879cf64e
Binary files /dev/null and b/src/tools/__pycache__/retriever.cpython-310.pyc differ
diff --git a/src/tools/llm.py b/src/tools/llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..369fe406bee54dfd618c8a6d5f6c87c0495718e7
--- /dev/null
+++ b/src/tools/llm.py
@@ -0,0 +1,71 @@
+"""LLM prompt helpers built on a shared LangChain ``OpenAI`` model."""
+
+import os
+
+from langchain.llms import OpenAI
+
+# SECURITY: a literal API key used to be assigned here.  It was committed to
+# version control, so it must be revoked; the key is now read from the
+# OPENAI_API_KEY environment variable, which must be set before import.
+OpenAI_KEY = os.environ.get("OPENAI_API_KEY")
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+openai_llm = OpenAI(temperature=0)
+
+
+def generate_paragraph(query: str, context: dict, language: str = 'fr') -> str:
+    """Ask the shared LLM to answer ``query`` from ``context``.
+
+    Args:
+        query: question inserted verbatim into the prompt.
+        context: material formatted into the prompt between triple backticks.
+        language: language requested for the answer.
+
+    Returns:
+        The value returned by calling ``openai_llm`` with the prompt.
+    """
+    prompt = (f" You are an agent designed to answer to the {query} based on the context delimited by triple backticks:\n"
+              f"``` {context}```\n"
+              f" The response shall be in {language} and shall be concise and factual\n"
+              f" In case the provided context does not seem relevant to answer to the question, just return that you "
+              f" don't know the answer ")
+    return openai_llm(prompt)
+
+
+def translate(text: str, language: str = "en") -> str:
+    """Ask the shared LLM to translate ``text`` between French and English.
+
+    Args:
+        text: text inserted into the prompt between triple backticks.
+        language: ``"en"`` requests french to english; any other value
+            requests english to french.
+
+    Returns:
+        The value returned by calling ``openai_llm`` with the prompt.
+    """
+    languages = "french to english" if language == "en" else "english to french"
+    # A real line break is wanted after the first sentence; the previous
+    # "\\n" put a literal backslash followed by "n" into the prompt.
+    template = (f" Your task consists in translating {languages}\n"
+                f" the following text: ```{text}```\n"
+                f" delimited by by triple backticks")
+    return openai_llm(template)
+
+
+def generate_paragraph2(query_and_context: str, language: str = 'fr') -> str:
+    """Ask the shared LLM to answer from one combined query/context string.
+
+    Args:
+        query_and_context: text formatted into the prompt between triple
+            backticks.
+        language: language requested for the answer.
+
+    Returns:
+        The value returned by calling ``openai_llm`` with the prompt.
+    """
+    prompt = (f" You are an agent designed to answer based on the query and content given below and delimited by triple"
+              f" backticks:\n"
+              f"``` {query_and_context}```\n"
+              f" The response shall be in {language} and shall be concise and factual\n"
+              f" In case the provided context does not seem relevant to answer to the question, just return that you "
+              f" don't know the answer ")
+    return openai_llm(prompt)
diff --git a/src/tools/retriever.py b/src/tools/retriever.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea089761ddedcc17da8a14f986fdb8f3977c35fd
--- /dev/null
+++ b/src/tools/retriever.py
@@ -0,0 +1,51 @@
+"""Helpers to fill and query a vector-store collection."""
+
+# NOTE(review): this is only a module-level name; it does not set the
+# TOKENIZERS_PARALLELISM environment variable - confirm what was intended.
+TOKENIZERS_PARALLELISM = True
+
+
+def init_collections(vs_client, doc_path, doc_content, collection_name):
+    """Create ``collection_name`` on ``vs_client`` and load the blocks into it.
+
+    Args:
+        vs_client: object exposing ``create_collection(name=...)``.
+        doc_path: object whose ``blocks`` items provide ``'content'`` (stored
+            as documents) and ``'paragraph'`` (stored as ids).
+        doc_content: object whose ``blocks`` are passed as the metadatas.
+        collection_name: name given to the new collection.
+
+    Returns:
+        The collection that was created and filled.
+    """
+    plan_blocks = doc_path.blocks
+    content_blocks = doc_content.blocks
+    collection_ = vs_client.create_collection(name=collection_name)
+    collection_.add(
+        documents=[b['content'] for b in plan_blocks],
+        ids=[b['paragraph'] for b in plan_blocks],
+        metadatas=content_blocks,
+    )
+    return collection_
+
+
+def similarity_search(collection: object, query: str) -> list:
+    """Query ``collection`` and return the metadata of the matches.
+
+    Each returned metadata entry is annotated in place with ``'distance_f'``
+    (the raw distance) and ``'distance'`` (the distance formatted with two
+    decimals).
+
+    Args:
+        collection: object exposing ``query(query_texts=...)``.
+        query: the text to search for.
+
+    Returns:
+        The first metadata list of the query result.
+    """
+    res = collection.query(query_texts=query)
+    sources = res['metadatas'][0]
+    for source, distance in zip(sources, res['distances'][0]):
+        source['distance_f'] = distance
+        source['distance'] = format(distance, '.2f')
+    return sources