File size: 6,811 Bytes
431961d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d4e4fe06",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain.document_loaders import PyPDFLoader,DirectoryLoader\n",
    "from langchain.chains.summarize import load_summarize_chain\n",
    "from transformers import T5Tokenizer,T5ForConditionalGeneration\n",
    "from transformers import pipeline\n",
    "import torch\n",
    "import base64"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "6b1f7fb1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\lalit\\anaconda3\\envs\\gen_ai\\Lib\\site-packages\\huggingface_hub\\file_download.py:147: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\lalit\\.cache\\huggingface\\hub\\models--MBZUAI--LaMini-Flan-T5-248M. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
      "  warnings.warn(message)\n",
      "You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n",
      "C:\\Users\\lalit\\anaconda3\\envs\\gen_ai\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "# Checkpoint identifier shared by the tokenizer and the model below.\n",
    "MODEL_NAME = 'MBZUAI/LaMini-Flan-T5-248M'\n",
    "\n",
    "tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)\n",
    "\n",
    "# The weights are requested in float32 with device_map set to \"auto\".\n",
    "base_model = T5ForConditionalGeneration.from_pretrained(\n",
    "    MODEL_NAME,\n",
    "    device_map=\"auto\",\n",
    "    torch_dtype=torch.float32,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "42bc76bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "def file_preprocessing(path):\n",
    "    \"\"\"Read a PDF and return its text as a single string.\n",
    "\n",
    "    The pages are joined directly instead of being cut into overlapping\n",
    "    chunks first: concatenating chunks produced with ``chunk_overlap > 0``\n",
    "    repeats the overlapping characters at every chunk boundary, so the\n",
    "    summarizer would receive duplicated text.\n",
    "\n",
    "    Args:\n",
    "        path: Location of the PDF file, passed to ``PyPDFLoader``.\n",
    "\n",
    "    Returns:\n",
    "        The ``page_content`` of every loaded page, separated by newlines;\n",
    "        an empty string when the loader yields no pages.\n",
    "    \"\"\"\n",
    "    pages = PyPDFLoader(path).load()\n",
    "    return \"\\n\".join(page.page_content for page in pages)\n",
    "\n",
    "\n",
    "def llm_pipeline(filepath,Task:str = \"summarization\"):\n",
    "    \"\"\"Run the notebook's model over the text of the PDF at ``filepath``.\n",
    "\n",
    "    Args:\n",
    "        filepath: PDF location, forwarded to ``file_preprocessing``.\n",
    "        Task: Task name handed to ``pipeline``.\n",
    "\n",
    "    Returns:\n",
    "        The ``summary_text`` entry of the first pipeline result.\n",
    "    \"\"\"\n",
    "    # Length bounds passed to the pipeline together with the shared model/tokenizer.\n",
    "    generation_limits = {\"max_length\": 500, \"min_length\": 50}\n",
    "    summarizer = pipeline(Task, model = base_model, tokenizer = tokenizer, **generation_limits)\n",
    "    document_text = file_preprocessing(filepath)\n",
    "    first_result = summarizer(document_text)[0]\n",
    "    return first_result[\"summary_text\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68ff4603",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reference: https://blog.futuresmart.ai/summarizing-documents-made-easy-with-langchain-summarizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d1792a05",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Infocusp Innovations is an IT firm that specializes in Machine Learning, Artificial Intelligence (AI), Computer Science, Software Engineering, Mobile and Web App Development, QA, and Signal Processing. They have offices in Ahmedabad, Pune, and the United States of America. The role of a Machine Learning Engineer involves designing and deploying advanced machine learning solutions. They possess deep expertise in algorithms, programming languages like Python, and specialized frameworks. They lead complex machine-learning projects, make critical architectural decisions, optimize models for performance, and collaborate with interdisciplinary teams. They contribute to cutting-edge research, develop innovative solutions, and stay at the forefront of the field to deliver impactful and state-of-the-art machine learning applications. Requirements for this role include a B.E., B.S., and Ph.D. in fields related to Computer Science with experience in machine learning, image, and signal processing, or statistics preferred. They are proficient in programming languages such as Tensorflow, Jax, Keras, Scikit-learn, and PyTorch, and have experience in deploying large-scale systems using distributed and cloud-based systems. They work closely with research and product engineering teams to develop next-generation solutions that improve the quality of life of users and enhance the user experience.'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): machine-specific absolute path; adjust before running elsewhere.\n",
    "pdf_path = r\"C:\\Users\\lalit\\Downloads\\Machine Learning Engineer.pdf\"\n",
    "llm_pipeline(pdf_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "677c558a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the generated summary in a variable so later cells can reuse it.\n",
    "summary = llm_pipeline(r\"C:\\Users\\lalit\\Downloads\\Machine Learning Engineer.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17e63c69",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}