{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "d4e4fe06", "metadata": {}, "outputs": [], "source": [ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "from langchain.document_loaders import PyPDFLoader,DirectoryLoader\n", "from langchain.chains.summarize import load_summarize_chain\n", "from transformers import T5Tokenizer,T5ForConditionalGeneration\n", "from transformers import pipeline\n", "import torch\n", "import base64" ] }, { "cell_type": "code", "execution_count": 3, "id": "6b1f7fb1", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\lalit\\anaconda3\\envs\\gen_ai\\Lib\\site-packages\\huggingface_hub\\file_download.py:147: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\lalit\\.cache\\huggingface\\hub\\models--MBZUAI--LaMini-Flan-T5-248M. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n", "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n", " warnings.warn(message)\n", "You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n", "C:\\Users\\lalit\\anaconda3\\envs\\gen_ai\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n", " warnings.warn(\n" ] } ], "source": [ "MODEL_NAME = 'MBZUAI/LaMini-Flan-T5-248M'\n", "\n", "tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)\n", "base_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME,device_map = \"auto\",\n", " torch_dtype = torch.float32)" ] }, { "cell_type": "code", "execution_count": 8, "id": "42bc76bd", "metadata": {}, "outputs": [], "source": [ "def file_preprocessing(path):\n", " loader = PyPDFLoader(path)\n", " pages = loader.load_and_split()\n", " text_splitter = RecursiveCharacterTextSplitter(chunk_size = 200,chunk_overlap = 50)\n", " texts = text_splitter.split_documents(pages)\n", " final_text = \"\"\n", " for text in texts:\n", " final_text += text.page_content\n", " return final_text\n", "\n", "\n", "def llm_pipeline(filepath,Task:str = \"summarization\"):\n", " summarization_pipeline = pipeline(Task,\n", " model = base_model, \n", " tokenizer = tokenizer,\n", " max_length = 500,\n", " min_length = 50)\n", " input_text = file_preprocessing(filepath)\n", " result = summarization_pipeline(input_text)\n", " result = result[0][\"summary_text\"]\n", " return result" ] }, { "cell_type": "code", "execution_count": null, "id": "68ff4603", "metadata": {}, "outputs": [], "source": [ "https://blog.futuresmart.ai/summarizing-documents-made-easy-with-langchain-summarizer" ] }, { "cell_type": "code", "execution_count": 9, "id": "d1792a05", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Infocusp Innovations is an IT firm that specializes in Machine Learning, Artificial Intelligence (AI), Computer Science, Software Engineering, Mobile and Web App Development, QA, and Signal Processing. They have offices in Ahmedabad, Pune, and the United States of America. The role of a Machine Learning Engineer involves designing and deploying advanced machine learning solutions. They possess deep expertise in algorithms, programming languages like Python, and specialized frameworks. They lead complex machine-learning projects, make critical architectural decisions, optimize models for performance, and collaborate with interdisciplinary teams. They contribute to cutting-edge research, develop innovative solutions, and stay at the forefront of the field to deliver impactful and state-of-the-art machine learning applications. Requirements for this role include a B.E., B.S., and Ph.D. in fields related to Computer Science with experience in machine learning, image, and signal processing, or statistics preferred. They are proficient in programming languages such as Tensorflow, Jax, Keras, Scikit-learn, and PyTorch, and have experience in deploying large-scale systems using distributed and cloud-based systems. They work closely with research and product engineering teams to develop next-generation solutions that improve the quality of life of users and enhance the user experience.'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "llm_pipeline(r\"C:\\Users\\lalit\\Downloads\\Machine Learning Engineer.pdf\")" ] }, { "cell_type": "code", "execution_count": null, "id": "677c558a", "metadata": {}, "outputs": [], "source": [ "sammry = " ] }, { "cell_type": "code", "execution_count": null, "id": "17e63c69", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 5 }