ManishW's picture
Upload folder using huggingface_hub
022acf4
<!DOCTYPE html>
<html class="writer-html5" lang="en" >
<head>
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<link rel="shortcut icon" href="../../img/favicon.ico" />
<title>data - NewsClassifier Docs</title>
<link rel="stylesheet" href="../../css/theme.css" />
<link rel="stylesheet" href="../../css/theme_extra.css" />
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
<link href="../../assets/_mkdocstrings.css" rel="stylesheet" />
<script>
// Current page data
var mkdocs_page_name = "data";
var mkdocs_page_input_path = "newsclassifier\\data.md";
var mkdocs_page_url = null;
</script>
<!--[if lt IE 9]>
<script src="../../js/html5shiv.min.js"></script>
<![endif]-->
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
<script>hljs.highlightAll();</script>
</head>
<body class="wy-body-for-nav" role="document">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<a href="../.." class="icon icon-home"> NewsClassifier Docs
</a>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<ul>
<li class="toctree-l1"><a class="reference internal" href="../..">Home</a>
</li>
</ul>
<p class="caption"><span class="caption-text">newsclassifier</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../config/">config</a>
</li>
<li class="toctree-l1 current"><a class="reference internal current" href="./">data</a>
<ul class="current">
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../models/">models</a>
</li>
<li class="toctree-l1"><a class="reference internal" href="../train/">train</a>
</li>
<li class="toctree-l1"><a class="reference internal" href="../tune/">tune</a>
</li>
<li class="toctree-l1"><a class="reference internal" href="../inference/">inference</a>
</li>
<li class="toctree-l1"><a class="reference internal" href="../utils/">utils</a>
</li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../..">NewsClassifier Docs</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
<li class="breadcrumb-item">newsclassifier</li>
<li class="breadcrumb-item active">data</li>
<li class="wy-breadcrumbs-aside">
<a href="https://github.com/ManishW315/NewsClassifier/edit/master/docs/newsclassifier/data.md" class="icon icon-github"> Edit on GitHub</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div class="section" itemprop="articleBody">
<div class="doc doc-object doc-module">
<a id="newsclassifier.data"></a>
<div class="doc doc-contents first">
<div class="doc doc-children">
<div class="doc doc-object doc-function">
<h2 id="newsclassifier.data.clean_text" class="doc doc-heading">
<code class="highlight language-python"><span class="n">clean_text</span><span class="p">(</span><span class="n">text</span><span class="p">)</span></code>
</h2>
<div class="doc doc-contents ">
<p>Clean text (lowercasing, punctuation removal, blank space removal).</p>
<details class="quote">
<summary> <code>newsclassifier\data.py</code></summary>
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">55</span>
<span class="normal">56</span>
<span class="normal">57</span>
<span class="normal">58</span>
<span class="normal">59</span>
<span class="normal">60</span>
<span class="normal">61</span>
<span class="normal">62</span>
<span class="normal">63</span>
<span class="normal">64</span>
<span class="normal">65</span>
<span class="normal">66</span>
<span class="normal">67</span>
<span class="normal">68</span>
<span class="normal">69</span>
<span class="normal">70</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span> <span class="nf">clean_text</span><span class="p">(</span><span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Clean text (lower, puntuations removal, blank space removal).&quot;&quot;&quot;</span>
<span class="c1"># lower case the text</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Cleaning input text.&quot;</span><span class="p">)</span>
<span class="n">text</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="c1"># necessary to do before as stopwords are in lower case</span>
<span class="c1"># remove stopwords</span>
<span class="n">stp_pattern</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="sa">r</span><span class="s2">&quot;\b(&quot;</span> <span class="o">+</span> <span class="sa">r</span><span class="s2">&quot;|&quot;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">Cfg</span><span class="o">.</span><span class="n">STOPWORDS</span><span class="p">)</span> <span class="o">+</span> <span class="sa">r</span><span class="s2">&quot;)\b\s*&quot;</span><span class="p">)</span>
<span class="n">text</span> <span class="o">=</span> <span class="n">stp_pattern</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="s2">&quot;&quot;</span><span class="p">,</span> <span class="n">text</span><span class="p">)</span>
<span class="c1"># custom cleaning</span>
<span class="n">text</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span> <span class="c1"># remove space at start or end if any</span>
<span class="n">text</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="s2">&quot; +&quot;</span><span class="p">,</span> <span class="s2">&quot; &quot;</span><span class="p">,</span> <span class="n">text</span><span class="p">)</span> <span class="c1"># remove extra spaces</span>
<span class="n">text</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="s2">&quot;[^A-Za-z0-9]+&quot;</span><span class="p">,</span> <span class="s2">&quot; &quot;</span><span class="p">,</span> <span class="n">text</span><span class="p">)</span> <span class="c1"># remove characters that are not alphanumeric</span>
<span class="k">return</span> <span class="n">text</span>
</code></pre></div></td></tr></table></div>
</details>
</div>
</div>
<div class="doc doc-object doc-function">
<h2 id="newsclassifier.data.collate" class="doc doc-heading">
<code class="highlight language-python"><span class="n">collate</span><span class="p">(</span><span class="n">inputs</span><span class="p">)</span></code>
</h2>
<div class="doc doc-contents ">
<p>Collate and modify the input dictionary to have the same sequence length for a particular input batch.</p>
<table class="field-list">
<colgroup>
<col class="field-name" />
<col class="field-body" />
</colgroup>
<tbody valign="top">
<tr class="field">
<th class="field-name">Parameters:</th>
<td class="field-body">
<ul class="first simple">
<li>
<b><code>inputs</code></b>
(<code>dict</code>)
<div class="doc-md-description">
<p>A dictionary containing input tensors with varying sequence lengths.</p>
</div>
</li>
</ul>
</td>
</tr>
</tbody>
</table>
<table class="field-list">
<colgroup>
<col class="field-name" />
<col class="field-body" />
</colgroup>
<tbody valign="top">
<tr class="field">
<th class="field-name">Returns:</th>
<td class="field-body">
<ul class="first simple">
<li>
<b><code>modified_inputs</code></b>( <code>dict</code>
) –
<div class="doc-md-description">
<p>A modified dictionary with input tensors trimmed to have the same sequence length.</p>
</div>
</li>
</ul>
</td>
</tr>
</tbody>
</table>
<details class="quote">
<summary> <code>newsclassifier\data.py</code></summary>
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">175</span>
<span class="normal">176</span>
<span class="normal">177</span>
<span class="normal">178</span>
<span class="normal">179</span>
<span class="normal">180</span>
<span class="normal">181</span>
<span class="normal">182</span>
<span class="normal">183</span>
<span class="normal">184</span>
<span class="normal">185</span>
<span class="normal">186</span>
<span class="normal">187</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span> <span class="nf">collate</span><span class="p">(</span><span class="n">inputs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Dict</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Collate and modify the input dictionary to have the same sequence length for a particular input batch.</span>
<span class="sd"> Args:</span>
<span class="sd"> inputs (dict): A dictionary containing input tensors with varying sequence lengths.</span>
<span class="sd"> Returns:</span>
<span class="sd"> modified_inputs (dict): A modified dictionary with input tensors trimmed to have the same sequence length.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">max_len</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">inputs</span><span class="p">[</span><span class="s2">&quot;input_ids&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">max</span><span class="p">())</span>
<span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">inputs</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="n">inputs</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="n">inputs</span><span class="p">[</span><span class="n">k</span><span class="p">][:,</span> <span class="p">:</span><span class="n">max_len</span><span class="p">]</span>
<span class="k">return</span> <span class="n">inputs</span>
</code></pre></div></td></tr></table></div>
</details>
</div>
</div>
<div class="doc doc-object doc-function">
<h2 id="newsclassifier.data.data_split" class="doc doc-heading">
<code class="highlight language-python"><span class="n">data_split</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">split_size</span><span class="o">=</span><span class="mf">0.2</span><span class="p">,</span> <span class="n">stratify_on_target</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">save_dfs</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></code>
</h2>
<div class="doc doc-contents ">
<p>Split data into train and test sets.</p>
<table class="field-list">
<colgroup>
<col class="field-name" />
<col class="field-body" />
</colgroup>
<tbody valign="top">
<tr class="field">
<th class="field-name">Parameters:</th>
<td class="field-body">
<ul class="first simple">
<li>
<b><code>df</code></b>
(<code><span title="pandas.DataFrame">DataFrame</span></code>)
<div class="doc-md-description">
<p>Data to be split.</p>
</div>
</li>
<li>
<b><code>split_size</code></b>
(<code>float</code>, default:
<code>0.2</code>
)
<div class="doc-md-description">
<p>train-test split ratio (test ratio).</p>
</div>
</li>
<li>
<b><code>stratify_on_target</code></b>
(<code>bool</code>, default:
<code>True</code>
)
<div class="doc-md-description">
<p>Whether to stratify the split on the target.</p>
</div>
</li>
<li>
<b><code>target_sep</code></b>
(<code>bool</code>)
<div class="doc-md-description">
<p>Whether to do target setting for train and test sets.</p>
</div>
</li>
<li>
<b><code>save_dfs</code></b>
(<code>bool</code>, default:
<code>False</code>
)
<div class="doc-md-description">
<p>Whether to save dataset splits in artifacts.</p>
</div>
</li>
</ul>
</td>
</tr>
</tbody>
</table>
<table class="field-list">
<colgroup>
<col class="field-name" />
<col class="field-body" />
</colgroup>
<tbody valign="top">
<tr class="field">
<th class="field-name">Returns:</th>
<td class="field-body">
<ul class="first simple">
<li>
<div class="doc-md-description">
<p>train-test splits (with/without target setting)</p>
</div>
</li>
</ul>
</td>
</tr>
</tbody>
</table>
<details class="quote">
<summary> <code>newsclassifier\data.py</code></summary>
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"> 99</span>
<span class="normal">100</span>
<span class="normal">101</span>
<span class="normal">102</span>
<span class="normal">103</span>
<span class="normal">104</span>
<span class="normal">105</span>
<span class="normal">106</span>
<span class="normal">107</span>
<span class="normal">108</span>
<span class="normal">109</span>
<span class="normal">110</span>
<span class="normal">111</span>
<span class="normal">112</span>
<span class="normal">113</span>
<span class="normal">114</span>
<span class="normal">115</span>
<span class="normal">116</span>
<span class="normal">117</span>
<span class="normal">118</span>
<span class="normal">119</span>
<span class="normal">120</span>
<span class="normal">121</span>
<span class="normal">122</span>
<span class="normal">123</span>
<span class="normal">124</span>
<span class="normal">125</span>
<span class="normal">126</span>
<span class="normal">127</span>
<span class="normal">128</span>
<span class="normal">129</span>
<span class="normal">130</span>
<span class="normal">131</span>
<span class="normal">132</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span> <span class="nf">data_split</span><span class="p">(</span><span class="n">df</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">split_size</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.2</span><span class="p">,</span> <span class="n">stratify_on_target</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">save_dfs</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Split data into train and test sets.</span>
<span class="sd"> Args:</span>
<span class="sd"> df (pd.DataFrame): Data to be split.</span>
<span class="sd"> split_size (float): train-test split ratio (test ratio).</span>
<span class="sd"> stratify_on_target (bool): Whether to do stratify split on target.</span>
<span class="sd"> target_sep (bool): Whether to do target setting for train and test sets.</span>
<span class="sd"> save_dfs (bool): Whether to save dataset splits in artifacts.</span>
<span class="sd"> Returns:</span>
<span class="sd"> train-test splits (with/without target setting)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Splitting Data.&quot;</span><span class="p">)</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">if</span> <span class="n">stratify_on_target</span><span class="p">:</span>
<span class="n">stra</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="s2">&quot;Category&quot;</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">stra</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">train</span><span class="p">,</span> <span class="n">test</span> <span class="o">=</span> <span class="n">train_test_split</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">test_size</span><span class="o">=</span><span class="n">split_size</span><span class="p">,</span> <span class="n">random_state</span><span class="o">=</span><span class="mi">42</span><span class="p">,</span> <span class="n">stratify</span><span class="o">=</span><span class="n">stra</span><span class="p">)</span>
<span class="n">train_ds</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">train</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">df</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<span class="n">test_ds</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">test</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">df</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<span class="k">if</span> <span class="n">save_dfs</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Saving and storing data splits.&quot;</span><span class="p">)</span>
<span class="n">os</span><span class="o">.</span><span class="n">makedirs</span><span class="p">(</span><span class="n">Cfg</span><span class="o">.</span><span class="n">preprocessed_data_path</span><span class="p">,</span> <span class="n">exist_ok</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">train</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">Cfg</span><span class="o">.</span><span class="n">preprocessed_data_path</span><span class="p">,</span> <span class="s2">&quot;train.csv&quot;</span><span class="p">))</span>
<span class="n">test</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">Cfg</span><span class="o">.</span><span class="n">preprocessed_data_path</span><span class="p">,</span> <span class="s2">&quot;test.csv&quot;</span><span class="p">))</span>
<span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="p">(</span><span class="n">e</span><span class="p">)</span>
<span class="k">return</span> <span class="n">train_ds</span><span class="p">,</span> <span class="n">test_ds</span>
</code></pre></div></td></tr></table></div>
</details>
</div>
</div>
<div class="doc doc-object doc-function">
<h2 id="newsclassifier.data.load_dataset" class="doc doc-heading">
<code class="highlight language-python"><span class="n">load_dataset</span><span class="p">(</span><span class="n">filepath</span><span class="p">,</span> <span class="n">print_i</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span></code>
</h2>
<div class="doc doc-contents ">
<p>Load data from source into a pandas DataFrame.</p>
<table class="field-list">
<colgroup>
<col class="field-name" />
<col class="field-body" />
</colgroup>
<tbody valign="top">
<tr class="field">
<th class="field-name">Parameters:</th>
<td class="field-body">
<ul class="first simple">
<li>
<b><code>filepath</code></b>
(<code>str</code>)
<div class="doc-md-description">
<p>file location.</p>
</div>
</li>
<li>
<b><code>print_i</code></b>
(<code>int</code>, default:
<code>0</code>
)
<div class="doc-md-description">
<p>Print number of instances.</p>
</div>
</li>
</ul>
</td>
</tr>
</tbody>
</table>
<table class="field-list">
<colgroup>
<col class="field-name" />
<col class="field-body" />
</colgroup>
<tbody valign="top">
<tr class="field">
<th class="field-name">Returns:</th>
<td class="field-body">
<ul class="first simple">
<li>
<code><span title="pandas.DataFrame">DataFrame</span></code>
<div class="doc-md-description">
<p>Pandas DataFrame of the data.</p>
</div>
</li>
</ul>
</td>
</tr>
</tbody>
</table>
<details class="quote">
<summary> <code>newsclassifier\data.py</code></summary>
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">17</span>
<span class="normal">18</span>
<span class="normal">19</span>
<span class="normal">20</span>
<span class="normal">21</span>
<span class="normal">22</span>
<span class="normal">23</span>
<span class="normal">24</span>
<span class="normal">25</span>
<span class="normal">26</span>
<span class="normal">27</span>
<span class="normal">28</span>
<span class="normal">29</span>
<span class="normal">30</span>
<span class="normal">31</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span> <span class="nf">load_dataset</span><span class="p">(</span><span class="n">filepath</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">print_i</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;load data from source into a Pandas DataFrame.</span>
<span class="sd"> Args:</span>
<span class="sd"> filepath (str): file location.</span>
<span class="sd"> print_i (int): Print number of instances.</span>
<span class="sd"> Returns:</span>
<span class="sd"> pd.DataFrame: Pandas DataFrame of the data.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Loading Data.&quot;</span><span class="p">)</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">filepath</span><span class="p">)</span>
<span class="k">if</span> <span class="n">print_i</span><span class="p">:</span>
<span class="nb">print</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">print_i</span><span class="p">),</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">df</span>
</code></pre></div></td></tr></table></div>
</details>
</div>
</div>
<div class="doc doc-object doc-function">
<h2 id="newsclassifier.data.prepare_data" class="doc doc-heading">
<code class="highlight language-python"><span class="n">prepare_data</span><span class="p">(</span><span class="n">df</span><span class="p">)</span></code>
</h2>
<div class="doc doc-contents ">
<p>Separate the "Headlines" instances and select the relevant features.</p>
<table class="field-list">
<colgroup>
<col class="field-name" />
<col class="field-body" />
</colgroup>
<tbody valign="top">
<tr class="field">
<th class="field-name">Parameters:</th>
<td class="field-body">
<ul class="first simple">
<li>
<b><code>df</code></b>
(<code><span title="pandas.DataFrame">DataFrame</span></code>)
<div class="doc-md-description">
<p>original dataframe.</p>
</div>
</li>
</ul>
</td>
</tr>
</tbody>
</table>
<table class="field-list">
<colgroup>
<col class="field-name" />
<col class="field-body" />
</colgroup>
<tbody valign="top">
<tr class="field">
<th class="field-name">Returns:</th>
<td class="field-body">
<ul class="first simple">
<li>
<b><code>df</code></b>( <code><span title="pandas.DataFrame">DataFrame</span></code>
) –
<div class="doc-md-description">
<p>new dataframe with appropriate features.</p>
</div>
</li>
<li>
<b><code>headlines_df</code></b>( <code><span title="pandas.DataFrame">DataFrame</span></code>
) –
<div class="doc-md-description">
<p>Dataframe containing "Headlines" category instances.</p>
</div>
</li>
</ul>
</td>
</tr>
</tbody>
</table>
<details class="quote">
<summary> <code>newsclassifier\data.py</code></summary>
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">34</span>
<span class="normal">35</span>
<span class="normal">36</span>
<span class="normal">37</span>
<span class="normal">38</span>
<span class="normal">39</span>
<span class="normal">40</span>
<span class="normal">41</span>
<span class="normal">42</span>
<span class="normal">43</span>
<span class="normal">44</span>
<span class="normal">45</span>
<span class="normal">46</span>
<span class="normal">47</span>
<span class="normal">48</span>
<span class="normal">49</span>
<span class="normal">50</span>
<span class="normal">51</span>
<span class="normal">52</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span> <span class="nf">prepare_data</span><span class="p">(</span><span class="n">df</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Separate headlines instance and feature selection.</span>
<span class="sd"> Args:</span>
<span class="sd"> df: original dataframe.</span>
<span class="sd"> Returns:</span>
<span class="sd"> df: new dataframe with appropriate features.</span>
<span class="sd"> headlines_df: dataframe cintaining &quot;headlines&quot; category instances.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Preparing Data.&quot;</span><span class="p">)</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="p">[[</span><span class="s2">&quot;Title&quot;</span><span class="p">,</span> <span class="s2">&quot;Category&quot;</span><span class="p">]]</span>
<span class="n">df</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;Title&quot;</span><span class="p">:</span> <span class="s2">&quot;Text&quot;</span><span class="p">},</span> <span class="n">inplace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">df</span><span class="p">,</span> <span class="n">headlines_df</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="n">df</span><span class="p">[</span><span class="s2">&quot;Category&quot;</span><span class="p">]</span> <span class="o">!=</span> <span class="s2">&quot;Headlines&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">),</span> <span class="n">df</span><span class="p">[</span><span class="n">df</span><span class="p">[</span><span class="s2">&quot;Category&quot;</span><span class="p">]</span> <span class="o">==</span> <span class="s2">&quot;Headlines&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="p">(</span><span class="n">e</span><span class="p">)</span>
<span class="k">return</span> <span class="n">df</span><span class="p">,</span> <span class="n">headlines_df</span>
</code></pre></div></td></tr></table></div>
</details>
</div>
</div>
<div class="doc doc-object doc-function">
<h2 id="newsclassifier.data.prepare_input" class="doc doc-heading">
<code class="highlight language-python"><span class="n">prepare_input</span><span class="p">(</span><span class="n">tokenizer</span><span class="p">,</span> <span class="n">text</span><span class="p">)</span></code>
</h2>
<div class="doc doc-contents ">
<p>Tokenize and prepare the input text using the provided tokenizer.</p>
<table class="field-list">
<colgroup>
<col class="field-name" />
<col class="field-body" />
</colgroup>
<tbody valign="top">
<tr class="field">
<th class="field-name">Parameters:</th>
<td class="field-body">
<ul class="first simple">
<li>
<b><code>tokenizer</code></b>
(<code><span title="transformers.RobertaTokenizer">RobertaTokenizer</span></code>)
<div class="doc-md-description">
<p>The Roberta tokenizer to encode the input.</p>
</div>
</li>
<li>
<b><code>text</code></b>
(<code>str</code>)
<div class="doc-md-description">
<p>The input text to be tokenized.</p>
</div>
</li>
</ul>
</td>
</tr>
</tbody>
</table>
<table class="field-list">
<colgroup>
<col class="field-name" />
<col class="field-body" />
</colgroup>
<tbody valign="top">
<tr class="field">
<th class="field-name">Returns:</th>
<td class="field-body">
<ul class="first simple">
<li>
<b><code>inputs</code></b>( <code>dict</code>
) –
<div class="doc-md-description">
<p>A dictionary containing the tokenized input with keys such as 'input_ids',
'attention_mask', etc.</p>
</div>
</li>
</ul>
</td>
</tr>
</tbody>
</table>
<details class="quote">
<summary> <code>newsclassifier\data.py</code></summary>
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">135</span>
<span class="normal">136</span>
<span class="normal">137</span>
<span class="normal">138</span>
<span class="normal">139</span>
<span class="normal">140</span>
<span class="normal">141</span>
<span class="normal">142</span>
<span class="normal">143</span>
<span class="normal">144</span>
<span class="normal">145</span>
<span class="normal">146</span>
<span class="normal">147</span>
<span class="normal">148</span>
<span class="normal">149</span>
<span class="normal">150</span>
<span class="normal">151</span>
<span class="normal">152</span>
<span class="normal">153</span>
<span class="normal">154</span>
<span class="normal">155</span>
<span class="normal">156</span>
<span class="normal">157</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span> <span class="nf">prepare_input</span><span class="p">(</span><span class="n">tokenizer</span><span class="p">:</span> <span class="n">RobertaTokenizer</span><span class="p">,</span> <span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Dict</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Tokenize and prepare the input text using the provided tokenizer.</span>
<span class="sd"> Args:</span>
<span class="sd"> tokenizer (RobertaTokenizer): The Roberta tokenizer to encode the input.</span>
<span class="sd"> text (str): The input text to be tokenized.</span>
<span class="sd"> Returns:</span>
<span class="sd"> inputs (dict): A dictionary containing the tokenized input with keys such as &#39;input_ids&#39;,</span>
<span class="sd"> &#39;attention_mask&#39;, etc.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">logger</span><span class="p">(</span><span class="s2">&quot;Tokenizing input text.&quot;</span><span class="p">)</span>
<span class="n">inputs</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">encode_plus</span><span class="p">(</span>
<span class="n">text</span><span class="p">,</span>
<span class="n">return_tensors</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">add_special_tokens</span><span class="o">=</span><span class="n">Cfg</span><span class="o">.</span><span class="n">add_special_tokens</span><span class="p">,</span>
<span class="n">max_length</span><span class="o">=</span><span class="n">Cfg</span><span class="o">.</span><span class="n">max_len</span><span class="p">,</span>
<span class="n">pad_to_max_length</span><span class="o">=</span><span class="n">Cfg</span><span class="o">.</span><span class="n">pad_to_max_length</span><span class="p">,</span>
<span class="n">truncation</span><span class="o">=</span><span class="n">Cfg</span><span class="o">.</span><span class="n">truncation</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">inputs</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="n">inputs</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">long</span><span class="p">)</span>
<span class="k">return</span> <span class="n">inputs</span>
</code></pre></div></td></tr></table></div>
</details>
</div>
</div>
<div class="doc doc-object doc-function">
<h2 id="newsclassifier.data.preprocess" class="doc doc-heading">
<code class="highlight language-python"><span class="n">preprocess</span><span class="p">(</span><span class="n">df</span><span class="p">)</span></code>
</h2>
<div class="doc doc-contents ">
<p>Preprocess the data.</p>
<table class="field-list">
<colgroup>
<col class="field-name" />
<col class="field-body" />
</colgroup>
<tbody valign="top">
<tr class="field">
<th class="field-name">Parameters:</th>
<td class="field-body">
<ul class="first simple">
<li>
<b><code>df</code></b>
(<code><span title="pandas.DataFrame">DataFrame</span></code>)
<div class="doc-md-description">
<p>Dataframe on which the preprocessing steps need to be performed.</p>
</div>
</li>
</ul>
</td>
</tr>
</tbody>
</table>
<table class="field-list">
<colgroup>
<col class="field-name" />
<col class="field-body" />
</colgroup>
<tbody valign="top">
<tr class="field">
<th class="field-name">Returns:</th>
<td class="field-body">
<ul class="first simple">
<li>
<b><code>df</code></b>( <code><span title="pandas.DataFrame">DataFrame</span></code>
) –
<div class="doc-md-description">
<p>Preprocessed Data.</p>
</div>
</li>
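<li>
<b><code>headlines_df</code></b>( <code><span title="pandas.DataFrame">DataFrame</span></code>
) –
<div class="doc-md-description">
<p>Second dataframe returned by <code>prepare_data</code>, passed through unchanged.</p>
</div>
</li>
<li>
<b><code>class_to_index</code></b>( <code><span title="typing.Dict">Dict</span></code>
) –
<div class="doc-md-description">
<p>Mapping from class labels to indices.</p>
</div>
</li>
<li>
<b><code>index_to_class</code></b>( <code><span title="typing.Dict">Dict</span></code>
) –
<div class="doc-md-description">
<p>Mapping from indices to class labels.</p>
</div>
</li>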
</ul>
</td>
</tr>
</tbody>
</table>
<details class="quote">
<summary> <code>newsclassifier\data.py</code></summary>
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">73</span>
<span class="normal">74</span>
<span class="normal">75</span>
<span class="normal">76</span>
<span class="normal">77</span>
<span class="normal">78</span>
<span class="normal">79</span>
<span class="normal">80</span>
<span class="normal">81</span>
<span class="normal">82</span>
<span class="normal">83</span>
<span class="normal">84</span>
<span class="normal">85</span>
<span class="normal">86</span>
<span class="normal">87</span>
<span class="normal">88</span>
<span class="normal">89</span>
<span class="normal">90</span>
<span class="normal">91</span>
<span class="normal">92</span>
<span class="normal">93</span>
<span class="normal">94</span>
<span class="normal">95</span>
<span class="normal">96</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span> <span class="nf">preprocess</span><span class="p">(</span><span class="n">df</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">Dict</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Preprocess the data.</span>
<span class="sd"> Args:</span>
<span class="sd"> df: Dataframe on which the preprocessing steps need to be performed.</span>
<span class="sd"> Returns:</span>
<span class="sd"> df: Preprocessed Data.</span>
<span class="sd"> class_to_index: class labels to indices mapping</span>
<span class="sd"> index_to_class: indices to class labels mapping</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">df</span><span class="p">,</span> <span class="n">headlines_df</span> <span class="o">=</span> <span class="n">prepare_data</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="n">cats</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="s2">&quot;Category&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span>
<span class="n">class_to_index</span> <span class="o">=</span> <span class="p">{</span><span class="n">tag</span><span class="p">:</span> <span class="n">i</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">tag</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">cats</span><span class="p">)}</span>
<span class="n">index_to_class</span> <span class="o">=</span> <span class="p">{</span><span class="n">v</span><span class="p">:</span> <span class="n">k</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">class_to_index</span><span class="o">.</span><span class="n">items</span><span class="p">()}</span>
<span class="n">df</span><span class="p">[</span><span class="s2">&quot;Text&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="s2">&quot;Text&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">clean_text</span><span class="p">)</span> <span class="c1"># clean text</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="p">[[</span><span class="s2">&quot;Text&quot;</span><span class="p">,</span> <span class="s2">&quot;Category&quot;</span><span class="p">]]</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">df</span><span class="p">[</span><span class="s2">&quot;Category&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="s2">&quot;Category&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">class_to_index</span><span class="p">)</span> <span class="c1"># label encoding</span>
<span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="p">(</span><span class="n">e</span><span class="p">)</span>
<span class="k">return</span> <span class="n">df</span><span class="p">,</span> <span class="n">headlines_df</span><span class="p">,</span> <span class="n">class_to_index</span><span class="p">,</span> <span class="n">index_to_class</span>
</code></pre></div></td></tr></table></div>
</details>
</div>
</div>
</div>
</div>
</div>
</div>
</div><footer>
<div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
<a href="../config/" class="btn btn-neutral float-left" title="config"><span class="icon icon-circle-arrow-left"></span> Previous</a>
<a href="../models/" class="btn btn-neutral float-right" title="models">Next <span class="icon icon-circle-arrow-right"></span></a>
</div>
<hr/>
<div role="contentinfo">
<!-- Copyright etc -->
</div>
Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<div class="rst-versions" role="note" aria-label="Versions">
<span class="rst-current-version" data-toggle="rst-current-version">
<span>
<a href="https://github.com/ManishW315/NewsClassifier" class="fa fa-github" style="color: #fcfcfc"> GitHub</a>
</span>
<span><a href="../config/" style="color: #fcfcfc">&laquo; Previous</a></span>
<span><a href="../models/" style="color: #fcfcfc">Next &raquo;</a></span>
</span>
</div>
<script src="../../js/jquery-3.6.0.min.js"></script>
<script>var base_url = "../..";</script>
<script src="../../js/theme_extra.js"></script>
<script src="../../js/theme.js"></script>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>