Spaces:
Sleeping
Sleeping
<html class="writer-html5" lang="en" > | |
<head> | |
<meta charset="utf-8" /> | |
<meta http-equiv="X-UA-Compatible" content="IE=edge" /> | |
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> | |
<link rel="shortcut icon" href="../../img/favicon.ico" /> | |
<title>data - NewsClassifier Docs</title> | |
<link rel="stylesheet" href="../../css/theme.css" /> | |
<link rel="stylesheet" href="../../css/theme_extra.css" /> | |
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" /> | |
<link href="../../assets/_mkdocstrings.css" rel="stylesheet" /> | |
<script> | |
// Current page data | |
var mkdocs_page_name = "data"; | |
var mkdocs_page_input_path = "newsclassifier\\data.md"; | |
var mkdocs_page_url = null; | |
</script> | |
<!--[if lt IE 9]> | |
<script src="../../js/html5shiv.min.js"></script> | |
<![endif]--> | |
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script> | |
<script>hljs.highlightAll();</script> | |
</head> | |
<body class="wy-body-for-nav" role="document"> | |
<div class="wy-grid-for-nav"> | |
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav"> | |
<div class="wy-side-scroll"> | |
<div class="wy-side-nav-search"> | |
<a href="../.." class="icon icon-home"> NewsClassifier Docs | |
</a> | |
</div> | |
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu"> | |
<ul> | |
<li class="toctree-l1"><a class="reference internal" href="../..">Home</a> | |
</li> | |
</ul> | |
<p class="caption"><span class="caption-text">newsclassifier</span></p> | |
<ul class="current"> | |
<li class="toctree-l1"><a class="reference internal" href="../config/">config</a> | |
</li> | |
<li class="toctree-l1 current"><a class="reference internal current" href="./">data</a> | |
<ul class="current"> | |
</ul> | |
</li> | |
<li class="toctree-l1"><a class="reference internal" href="../models/">models</a> | |
</li> | |
<li class="toctree-l1"><a class="reference internal" href="../train/">train</a> | |
</li> | |
<li class="toctree-l1"><a class="reference internal" href="../tune/">tune</a> | |
</li> | |
<li class="toctree-l1"><a class="reference internal" href="../inference/">inference</a> | |
</li> | |
<li class="toctree-l1"><a class="reference internal" href="../utils/">utils</a> | |
</li> | |
</ul> | |
</div> | |
</div> | |
</nav> | |
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"> | |
<nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu"> | |
<i data-toggle="wy-nav-top" class="fa fa-bars"></i> | |
<a href="../..">NewsClassifier Docs</a> | |
</nav> | |
<div class="wy-nav-content"> | |
<div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation"> | |
<ul class="wy-breadcrumbs"> | |
<li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li> | |
<li class="breadcrumb-item">newsclassifier</li> | |
<li class="breadcrumb-item active">data</li> | |
<li class="wy-breadcrumbs-aside"> | |
<a href="https://github.com/ManishW315/NewsClassifier/edit/master/docs/newsclassifier/data.md" class="icon icon-github"> Edit on GitHub</a> | |
</li> | |
</ul> | |
<hr/> | |
</div> | |
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article"> | |
<div class="section" itemprop="articleBody"> | |
<div class="doc doc-object doc-module"> | |
<a id="newsclassifier.data"></a> | |
<div class="doc doc-contents first"> | |
<div class="doc doc-children"> | |
<div class="doc doc-object doc-function"> | |
<h2 id="newsclassifier.data.clean_text" class="doc doc-heading"> | |
<code class="highlight language-python"><span class="n">clean_text</span><span class="p">(</span><span class="n">text</span><span class="p">)</span></code> | |
</h2> | |
<div class="doc doc-contents "> | |
<p>Clean text (lowercasing, punctuation removal, blank space removal).</p> | |
<details class="quote"> | |
<summary> <code>newsclassifier\data.py</code></summary> | |
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">55</span> | |
<span class="normal">56</span> | |
<span class="normal">57</span> | |
<span class="normal">58</span> | |
<span class="normal">59</span> | |
<span class="normal">60</span> | |
<span class="normal">61</span> | |
<span class="normal">62</span> | |
<span class="normal">63</span> | |
<span class="normal">64</span> | |
<span class="normal">65</span> | |
<span class="normal">66</span> | |
<span class="normal">67</span> | |
<span class="normal">68</span> | |
<span class="normal">69</span> | |
<span class="normal">70</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span> <span class="nf">clean_text</span><span class="p">(</span><span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> | |
<span class="w"> </span><span class="sd">"""Clean text (lower, puntuations removal, blank space removal)."""</span> | |
<span class="c1"># lower case the text</span> | |
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Cleaning input text."</span><span class="p">)</span> | |
<span class="n">text</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="c1"># necessary to do before as stopwords are in lower case</span> | |
<span class="c1"># remove stopwords</span> | |
<span class="n">stp_pattern</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="sa">r</span><span class="s2">"\b("</span> <span class="o">+</span> <span class="sa">r</span><span class="s2">"|"</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">Cfg</span><span class="o">.</span><span class="n">STOPWORDS</span><span class="p">)</span> <span class="o">+</span> <span class="sa">r</span><span class="s2">")\b\s*"</span><span class="p">)</span> | |
<span class="n">text</span> <span class="o">=</span> <span class="n">stp_pattern</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="s2">""</span><span class="p">,</span> <span class="n">text</span><span class="p">)</span> | |
<span class="c1"># custom cleaning</span> | |
<span class="n">text</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span> <span class="c1"># remove space at start or end if any</span> | |
<span class="n">text</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="s2">" +"</span><span class="p">,</span> <span class="s2">" "</span><span class="p">,</span> <span class="n">text</span><span class="p">)</span> <span class="c1"># remove extra spaces</span> | |
<span class="n">text</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="s2">"[^A-Za-z0-9]+"</span><span class="p">,</span> <span class="s2">" "</span><span class="p">,</span> <span class="n">text</span><span class="p">)</span> <span class="c1"># remove characters that are not alphanumeric</span> | |
<span class="k">return</span> <span class="n">text</span> | |
</code></pre></div></td></tr></table></div> | |
</details> | |
</div> | |
</div> | |
<div class="doc doc-object doc-function"> | |
<h2 id="newsclassifier.data.collate" class="doc doc-heading"> | |
<code class="highlight language-python"><span class="n">collate</span><span class="p">(</span><span class="n">inputs</span><span class="p">)</span></code> | |
</h2> | |
<div class="doc doc-contents "> | |
<p>Collate and modify the input dictionary to have the same sequence length for a particular input batch.</p> | |
<table class="field-list"> | |
<colgroup> | |
<col class="field-name" /> | |
<col class="field-body" /> | |
</colgroup> | |
<tbody valign="top"> | |
<tr class="field"> | |
<th class="field-name">Parameters:</th> | |
<td class="field-body"> | |
<ul class="first simple"> | |
<li> | |
<b><code>inputs</code></b> | |
(<code>dict</code>) | |
– | |
<div class="doc-md-description"> | |
<p>A dictionary containing input tensors with varying sequence lengths.</p> | |
</div> | |
</li> | |
</ul> | |
</td> | |
</tr> | |
</tbody> | |
</table> | |
<table class="field-list"> | |
<colgroup> | |
<col class="field-name" /> | |
<col class="field-body" /> | |
</colgroup> | |
<tbody valign="top"> | |
<tr class="field"> | |
<th class="field-name">Returns:</th> | |
<td class="field-body"> | |
<ul class="first simple"> | |
<li> | |
<b><code>modified_inputs</code></b>( <code>dict</code> | |
) – | |
<div class="doc-md-description"> | |
<p>A modified dictionary with input tensors trimmed to have the same sequence length.</p> | |
</div> | |
</li> | |
</ul> | |
</td> | |
</tr> | |
</tbody> | |
</table> | |
<details class="quote"> | |
<summary> <code>newsclassifier\data.py</code></summary> | |
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">175</span> | |
<span class="normal">176</span> | |
<span class="normal">177</span> | |
<span class="normal">178</span> | |
<span class="normal">179</span> | |
<span class="normal">180</span> | |
<span class="normal">181</span> | |
<span class="normal">182</span> | |
<span class="normal">183</span> | |
<span class="normal">184</span> | |
<span class="normal">185</span> | |
<span class="normal">186</span> | |
<span class="normal">187</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span> <span class="nf">collate</span><span class="p">(</span><span class="n">inputs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">)</span> <span class="o">-></span> <span class="n">Dict</span><span class="p">:</span> | |
<span class="w"> </span><span class="sd">"""Collate and modify the input dictionary to have the same sequence length for a particular input batch.</span> | |
<span class="sd"> Args:</span> | |
<span class="sd"> inputs (dict): A dictionary containing input tensors with varying sequence lengths.</span> | |
<span class="sd"> Returns:</span> | |
<span class="sd"> modified_inputs (dict): A modified dictionary with input tensors trimmed to have the same sequence length.</span> | |
<span class="sd"> """</span> | |
<span class="n">max_len</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">inputs</span><span class="p">[</span><span class="s2">"input_ids"</span><span class="p">]</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">max</span><span class="p">())</span> | |
<span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">inputs</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> | |
<span class="n">inputs</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="n">inputs</span><span class="p">[</span><span class="n">k</span><span class="p">][:,</span> <span class="p">:</span><span class="n">max_len</span><span class="p">]</span> | |
<span class="k">return</span> <span class="n">inputs</span> | |
</code></pre></div></td></tr></table></div> | |
</details> | |
</div> | |
</div> | |
<div class="doc doc-object doc-function"> | |
<h2 id="newsclassifier.data.data_split" class="doc doc-heading"> | |
<code class="highlight language-python"><span class="n">data_split</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">split_size</span><span class="o">=</span><span class="mf">0.2</span><span class="p">,</span> <span class="n">stratify_on_target</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">save_dfs</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></code> | |
</h2> | |
<div class="doc doc-contents "> | |
<p>Split data into train and test sets.</p> | |
<table class="field-list"> | |
<colgroup> | |
<col class="field-name" /> | |
<col class="field-body" /> | |
</colgroup> | |
<tbody valign="top"> | |
<tr class="field"> | |
<th class="field-name">Parameters:</th> | |
<td class="field-body"> | |
<ul class="first simple"> | |
<li> | |
<b><code>df</code></b> | |
(<code><span title="pandas.DataFrame">DataFrame</span></code>) | |
– | |
<div class="doc-md-description"> | |
<p>Data to be split.</p> | |
</div> | |
</li> | |
<li> | |
<b><code>split_size</code></b> | |
(<code>float</code>, default: | |
<code>0.2</code> | |
) | |
– | |
<div class="doc-md-description"> | |
<p>train-test split ratio (test ratio).</p> | |
</div> | |
</li> | |
<li> | |
<b><code>stratify_on_target</code></b> | |
(<code>bool</code>, default: | |
<code>True</code> | |
) | |
– | |
<div class="doc-md-description"> | |
<p>Whether to stratify the split on the target.</p> | |
</div> | |
</li> | |
<li> | |
<b><code>target_sep</code></b> | |
(<code>bool</code>) | |
– | |
<div class="doc-md-description"> | |
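<p>Whether to do target setting for train and test sets. (Note: this parameter is documented in the docstring but does not appear in the function signature shown for <code>data_split</code>.)</p> | |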
</div> | |
</li> | |
<li> | |
<b><code>save_dfs</code></b> | |
(<code>bool</code>, default: | |
<code>False</code> | |
) | |
– | |
<div class="doc-md-description"> | |
<p>Whether to save dataset splits in artifacts.</p> | |
</div> | |
</li> | |
</ul> | |
</td> | |
</tr> | |
</tbody> | |
</table> | |
<table class="field-list"> | |
<colgroup> | |
<col class="field-name" /> | |
<col class="field-body" /> | |
</colgroup> | |
<tbody valign="top"> | |
<tr class="field"> | |
<th class="field-name">Returns:</th> | |
<td class="field-body"> | |
<ul class="first simple"> | |
<li> | |
– | |
<div class="doc-md-description"> | |
<p>train-test splits (with/without target setting)</p> | |
</div> | |
</li> | |
</ul> | |
</td> | |
</tr> | |
</tbody> | |
</table> | |
<details class="quote"> | |
<summary> <code>newsclassifier\data.py</code></summary> | |
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"> 99</span> | |
<span class="normal">100</span> | |
<span class="normal">101</span> | |
<span class="normal">102</span> | |
<span class="normal">103</span> | |
<span class="normal">104</span> | |
<span class="normal">105</span> | |
<span class="normal">106</span> | |
<span class="normal">107</span> | |
<span class="normal">108</span> | |
<span class="normal">109</span> | |
<span class="normal">110</span> | |
<span class="normal">111</span> | |
<span class="normal">112</span> | |
<span class="normal">113</span> | |
<span class="normal">114</span> | |
<span class="normal">115</span> | |
<span class="normal">116</span> | |
<span class="normal">117</span> | |
<span class="normal">118</span> | |
<span class="normal">119</span> | |
<span class="normal">120</span> | |
<span class="normal">121</span> | |
<span class="normal">122</span> | |
<span class="normal">123</span> | |
<span class="normal">124</span> | |
<span class="normal">125</span> | |
<span class="normal">126</span> | |
<span class="normal">127</span> | |
<span class="normal">128</span> | |
<span class="normal">129</span> | |
<span class="normal">130</span> | |
<span class="normal">131</span> | |
<span class="normal">132</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span> <span class="nf">data_split</span><span class="p">(</span><span class="n">df</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">split_size</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.2</span><span class="p">,</span> <span class="n">stratify_on_target</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">save_dfs</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">):</span> | |
<span class="w"> </span><span class="sd">"""Split data into train and test sets.</span> | |
<span class="sd"> Args:</span> | |
<span class="sd"> df (pd.DataFrame): Data to be split.</span> | |
<span class="sd"> split_size (float): train-test split ratio (test ratio).</span> | |
<span class="sd"> stratify_on_target (bool): Whether to do stratify split on target.</span> | |
<span class="sd"> target_sep (bool): Whether to do target setting for train and test sets.</span> | |
<span class="sd"> save_dfs (bool): Whether to save dataset splits in artifacts.</span> | |
<span class="sd"> Returns:</span> | |
<span class="sd"> train-test splits (with/without target setting)</span> | |
<span class="sd"> """</span> | |
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Splitting Data."</span><span class="p">)</span> | |
<span class="k">try</span><span class="p">:</span> | |
<span class="k">if</span> <span class="n">stratify_on_target</span><span class="p">:</span> | |
<span class="n">stra</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="s2">"Category"</span><span class="p">]</span> | |
<span class="k">else</span><span class="p">:</span> | |
<span class="n">stra</span> <span class="o">=</span> <span class="kc">None</span> | |
<span class="n">train</span><span class="p">,</span> <span class="n">test</span> <span class="o">=</span> <span class="n">train_test_split</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">test_size</span><span class="o">=</span><span class="n">split_size</span><span class="p">,</span> <span class="n">random_state</span><span class="o">=</span><span class="mi">42</span><span class="p">,</span> <span class="n">stratify</span><span class="o">=</span><span class="n">stra</span><span class="p">)</span> | |
<span class="n">train_ds</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">train</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">df</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span> | |
<span class="n">test_ds</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">test</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">df</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span> | |
<span class="k">if</span> <span class="n">save_dfs</span><span class="p">:</span> | |
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Saving and storing data splits."</span><span class="p">)</span> | |
<span class="n">os</span><span class="o">.</span><span class="n">makedirs</span><span class="p">(</span><span class="n">Cfg</span><span class="o">.</span><span class="n">preprocessed_data_path</span><span class="p">,</span> <span class="n">exist_ok</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> | |
<span class="n">train</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">Cfg</span><span class="o">.</span><span class="n">preprocessed_data_path</span><span class="p">,</span> <span class="s2">"train.csv"</span><span class="p">))</span> | |
<span class="n">test</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">Cfg</span><span class="o">.</span><span class="n">preprocessed_data_path</span><span class="p">,</span> <span class="s2">"test.csv"</span><span class="p">))</span> | |
<span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span> | |
<span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="p">(</span><span class="n">e</span><span class="p">)</span> | |
<span class="k">return</span> <span class="n">train_ds</span><span class="p">,</span> <span class="n">test_ds</span> | |
</code></pre></div></td></tr></table></div> | |
</details> | |
</div> | |
</div> | |
<div class="doc doc-object doc-function"> | |
<h2 id="newsclassifier.data.load_dataset" class="doc doc-heading"> | |
<code class="highlight language-python"><span class="n">load_dataset</span><span class="p">(</span><span class="n">filepath</span><span class="p">,</span> <span class="n">print_i</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span></code> | |
</h2> | |
<div class="doc doc-contents "> | |
<p>Load data from source into a Pandas DataFrame.</p> | |
<table class="field-list"> | |
<colgroup> | |
<col class="field-name" /> | |
<col class="field-body" /> | |
</colgroup> | |
<tbody valign="top"> | |
<tr class="field"> | |
<th class="field-name">Parameters:</th> | |
<td class="field-body"> | |
<ul class="first simple"> | |
<li> | |
<b><code>filepath</code></b> | |
(<code>str</code>) | |
– | |
<div class="doc-md-description"> | |
<p>file location.</p> | |
</div> | |
</li> | |
<li> | |
<b><code>print_i</code></b> | |
(<code>int</code>, default: | |
<code>0</code> | |
) | |
– | |
<div class="doc-md-description"> | |
<p>Print number of instances.</p> | |
</div> | |
</li> | |
</ul> | |
</td> | |
</tr> | |
</tbody> | |
</table> | |
<table class="field-list"> | |
<colgroup> | |
<col class="field-name" /> | |
<col class="field-body" /> | |
</colgroup> | |
<tbody valign="top"> | |
<tr class="field"> | |
<th class="field-name">Returns:</th> | |
<td class="field-body"> | |
<ul class="first simple"> | |
<li> | |
<code><span title="pandas.DataFrame">DataFrame</span></code> | |
– | |
<div class="doc-md-description"> | |
<p>Pandas DataFrame of the data.</p> | |
</div> | |
</li> | |
</ul> | |
</td> | |
</tr> | |
</tbody> | |
</table> | |
<details class="quote"> | |
<summary> <code>newsclassifier\data.py</code></summary> | |
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">17</span> | |
<span class="normal">18</span> | |
<span class="normal">19</span> | |
<span class="normal">20</span> | |
<span class="normal">21</span> | |
<span class="normal">22</span> | |
<span class="normal">23</span> | |
<span class="normal">24</span> | |
<span class="normal">25</span> | |
<span class="normal">26</span> | |
<span class="normal">27</span> | |
<span class="normal">28</span> | |
<span class="normal">29</span> | |
<span class="normal">30</span> | |
<span class="normal">31</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span> <span class="nf">load_dataset</span><span class="p">(</span><span class="n">filepath</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">print_i</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> | |
<span class="w"> </span><span class="sd">"""load data from source into a Pandas DataFrame.</span> | |
<span class="sd"> Args:</span> | |
<span class="sd"> filepath (str): file location.</span> | |
<span class="sd"> print_i (int): Print number of instances.</span> | |
<span class="sd"> Returns:</span> | |
<span class="sd"> pd.DataFrame: Pandas DataFrame of the data.</span> | |
<span class="sd"> """</span> | |
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Loading Data."</span><span class="p">)</span> | |
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">filepath</span><span class="p">)</span> | |
<span class="k">if</span> <span class="n">print_i</span><span class="p">:</span> | |
<span class="nb">print</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">print_i</span><span class="p">),</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">"</span><span class="p">)</span> | |
<span class="k">return</span> <span class="n">df</span> | |
</code></pre></div></td></tr></table></div> | |
</details> | |
</div> | |
</div> | |
<div class="doc doc-object doc-function"> | |
<h2 id="newsclassifier.data.prepare_data" class="doc doc-heading"> | |
<code class="highlight language-python"><span class="n">prepare_data</span><span class="p">(</span><span class="n">df</span><span class="p">)</span></code> | |
</h2> | |
<div class="doc doc-contents "> | |
<p>Separate the "Headlines" instances and perform feature selection.</p> | |
<table class="field-list"> | |
<colgroup> | |
<col class="field-name" /> | |
<col class="field-body" /> | |
</colgroup> | |
<tbody valign="top"> | |
<tr class="field"> | |
<th class="field-name">Parameters:</th> | |
<td class="field-body"> | |
<ul class="first simple"> | |
<li> | |
<b><code>df</code></b> | |
(<code><span title="pandas.DataFrame">DataFrame</span></code>) | |
– | |
<div class="doc-md-description"> | |
<p>original dataframe.</p> | |
</div> | |
</li> | |
</ul> | |
</td> | |
</tr> | |
</tbody> | |
</table> | |
<table class="field-list"> | |
<colgroup> | |
<col class="field-name" /> | |
<col class="field-body" /> | |
</colgroup> | |
<tbody valign="top"> | |
<tr class="field"> | |
<th class="field-name">Returns:</th> | |
<td class="field-body"> | |
<ul class="first simple"> | |
<li> | |
<b><code>df</code></b>( <code><span title="pandas.DataFrame">DataFrame</span></code> | |
) – | |
<div class="doc-md-description"> | |
<p>new dataframe with appropriate features.</p> | |
</div> | |
</li> | |
<li> | |
<b><code>headlines_df</code></b>( <code><span title="pandas.DataFrame">DataFrame</span></code> | |
) – | |
<div class="doc-md-description"> | |
<p>dataframe containing "headlines" category instances.</p> | |
</div> | |
</li> | |
</ul> | |
</td> | |
</tr> | |
</tbody> | |
</table> | |
<details class="quote"> | |
<summary> <code>newsclassifier\data.py</code></summary> | |
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">34</span> | |
<span class="normal">35</span> | |
<span class="normal">36</span> | |
<span class="normal">37</span> | |
<span class="normal">38</span> | |
<span class="normal">39</span> | |
<span class="normal">40</span> | |
<span class="normal">41</span> | |
<span class="normal">42</span> | |
<span class="normal">43</span> | |
<span class="normal">44</span> | |
<span class="normal">45</span> | |
<span class="normal">46</span> | |
<span class="normal">47</span> | |
<span class="normal">48</span> | |
<span class="normal">49</span> | |
<span class="normal">50</span> | |
<span class="normal">51</span> | |
<span class="normal">52</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span> <span class="nf">prepare_data</span><span class="p">(</span><span class="n">df</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">]:</span> | |
<span class="w"> </span><span class="sd">"""Separate headlines instance and feature selection.</span> | |
<span class="sd"> Args:</span> | |
<span class="sd"> df: original dataframe.</span> | |
<span class="sd"> Returns:</span> | |
<span class="sd"> df: new dataframe with appropriate features.</span> | |
<span class="sd"> headlines_df: dataframe cintaining "headlines" category instances.</span> | |
<span class="sd"> """</span> | |
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Preparing Data."</span><span class="p">)</span> | |
<span class="k">try</span><span class="p">:</span> | |
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="p">[[</span><span class="s2">"Title"</span><span class="p">,</span> <span class="s2">"Category"</span><span class="p">]]</span> | |
<span class="n">df</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="s2">"Title"</span><span class="p">:</span> <span class="s2">"Text"</span><span class="p">},</span> <span class="n">inplace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> | |
<span class="n">df</span><span class="p">,</span> <span class="n">headlines_df</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="n">df</span><span class="p">[</span><span class="s2">"Category"</span><span class="p">]</span> <span class="o">!=</span> <span class="s2">"Headlines"</span><span class="p">]</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">),</span> <span class="n">df</span><span class="p">[</span><span class="n">df</span><span class="p">[</span><span class="s2">"Category"</span><span class="p">]</span> <span class="o">==</span> <span class="s2">"Headlines"</span><span class="p">]</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> | |
<span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span> | |
<span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="p">(</span><span class="n">e</span><span class="p">)</span> | |
<span class="k">return</span> <span class="n">df</span><span class="p">,</span> <span class="n">headlines_df</span> | |
</code></pre></div></td></tr></table></div> | |
</details> | |
</div> | |
</div> | |
<div class="doc doc-object doc-function"> | |
<h2 id="newsclassifier.data.prepare_input" class="doc doc-heading"> | |
<code class="highlight language-python"><span class="n">prepare_input</span><span class="p">(</span><span class="n">tokenizer</span><span class="p">,</span> <span class="n">text</span><span class="p">)</span></code> | |
</h2> | |
<div class="doc doc-contents "> | |
<p>Tokenize and prepare the input text using the provided tokenizer.</p> | |
<table class="field-list"> | |
<colgroup> | |
<col class="field-name" /> | |
<col class="field-body" /> | |
</colgroup> | |
<tbody valign="top"> | |
<tr class="field"> | |
<th class="field-name">Parameters:</th> | |
<td class="field-body"> | |
<ul class="first simple"> | |
<li> | |
<b><code>tokenizer</code></b> | |
(<code><span title="transformers.RobertaTokenizer">RobertaTokenizer</span></code>) | |
– | |
<div class="doc-md-description"> | |
<p>The Roberta tokenizer to encode the input.</p> | |
</div> | |
</li> | |
<li> | |
<b><code>text</code></b> | |
(<code>str</code>) | |
– | |
<div class="doc-md-description"> | |
<p>The input text to be tokenized.</p> | |
</div> | |
</li> | |
</ul> | |
</td> | |
</tr> | |
</tbody> | |
</table> | |
<table class="field-list"> | |
<colgroup> | |
<col class="field-name" /> | |
<col class="field-body" /> | |
</colgroup> | |
<tbody valign="top"> | |
<tr class="field"> | |
<th class="field-name">Returns:</th> | |
<td class="field-body"> | |
<ul class="first simple"> | |
<li> | |
<b><code>inputs</code></b>( <code>dict</code> | |
) – | |
<div class="doc-md-description"> | |
<p>A dictionary containing the tokenized input with keys such as 'input_ids', | |
'attention_mask', etc.</p> | |
</div> | |
</li> | |
</ul> | |
</td> | |
</tr> | |
</tbody> | |
</table> | |
<details class="quote"> | |
<summary> <code>newsclassifier\data.py</code></summary> | |
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">135</span> | |
<span class="normal">136</span> | |
<span class="normal">137</span> | |
<span class="normal">138</span> | |
<span class="normal">139</span> | |
<span class="normal">140</span> | |
<span class="normal">141</span> | |
<span class="normal">142</span> | |
<span class="normal">143</span> | |
<span class="normal">144</span> | |
<span class="normal">145</span> | |
<span class="normal">146</span> | |
<span class="normal">147</span> | |
<span class="normal">148</span> | |
<span class="normal">149</span> | |
<span class="normal">150</span> | |
<span class="normal">151</span> | |
<span class="normal">152</span> | |
<span class="normal">153</span> | |
<span class="normal">154</span> | |
<span class="normal">155</span> | |
<span class="normal">156</span> | |
<span class="normal">157</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span> <span class="nf">prepare_input</span><span class="p">(</span><span class="n">tokenizer</span><span class="p">:</span> <span class="n">RobertaTokenizer</span><span class="p">,</span> <span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Dict</span><span class="p">:</span>
<span class="w">    </span><span class="sd">"""Tokenize and prepare the input text using the provided tokenizer.</span>
<span class="sd">    Args:</span>
<span class="sd">        tokenizer (RobertaTokenizer): The Roberta tokenizer to encode the input.</span>
<span class="sd">        text (str): The input text to be tokenized.</span>
<span class="sd">    Returns:</span>
<span class="sd">        inputs (dict): A dictionary containing the tokenized input with keys such as 'input_ids',</span>
<span class="sd">        'attention_mask', etc. Every value is converted to a torch.long tensor.</span>
<span class="sd">    """</span>
    <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Tokenizing input text."</span><span class="p">)</span>
    <span class="n">inputs</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">encode_plus</span><span class="p">(</span>
        <span class="n">text</span><span class="p">,</span>
        <span class="n">return_tensors</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>  <span class="c1"># no framework tensors requested; each value is converted in the loop below</span>
        <span class="n">add_special_tokens</span><span class="o">=</span><span class="n">Cfg</span><span class="o">.</span><span class="n">add_special_tokens</span><span class="p">,</span>
        <span class="n">max_length</span><span class="o">=</span><span class="n">Cfg</span><span class="o">.</span><span class="n">max_len</span><span class="p">,</span>
        <span class="n">pad_to_max_length</span><span class="o">=</span><span class="n">Cfg</span><span class="o">.</span><span class="n">pad_to_max_length</span><span class="p">,</span>  <span class="c1"># NOTE(review): deprecated in newer transformers in favour of `padding` - confirm installed version</span>
        <span class="n">truncation</span><span class="o">=</span><span class="n">Cfg</span><span class="o">.</span><span class="n">truncation</span><span class="p">,</span>
    <span class="p">)</span>
    <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">inputs</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
        <span class="n">inputs</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">long</span><span class="p">)</span>
    <span class="k">return</span> <span class="n">inputs</span>
</code></pre></div></td></tr></table></div> | |
</details> | |
</div> | |
</div> | |
<div class="doc doc-object doc-function"> | |
<h2 id="newsclassifier.data.preprocess" class="doc doc-heading"> | |
<code class="highlight language-python"><span class="n">preprocess</span><span class="p">(</span><span class="n">df</span><span class="p">)</span></code> | |
</h2> | |
<div class="doc doc-contents "> | |
<p>Preprocess the data.</p> | |
<table class="field-list"> | |
<colgroup> | |
<col class="field-name" /> | |
<col class="field-body" /> | |
</colgroup> | |
<tbody valign="top"> | |
<tr class="field"> | |
<th class="field-name">Parameters:</th> | |
<td class="field-body"> | |
<ul class="first simple"> | |
<li> | |
<b><code>df</code></b> | |
(<code><span title="pandas.DataFrame">DataFrame</span></code>) | |
– | |
<div class="doc-md-description"> | |
<p>Dataframe on which the preprocessing steps need to be performed.</p> | |
</div> | |
</li> | |
</ul> | |
</td> | |
</tr> | |
</tbody> | |
</table> | |
<table class="field-list"> | |
<colgroup> | |
<col class="field-name" /> | |
<col class="field-body" /> | |
</colgroup> | |
<tbody valign="top"> | |
<tr class="field"> | |
<th class="field-name">Returns:</th> | |
<td class="field-body"> | |
<ul class="first simple"> | |
<li> | |
<b><code>df</code></b>( <code><span title="pandas.DataFrame">DataFrame</span></code> | |
) – | |
<div class="doc-md-description"> | |
<p>Preprocessed Data.</p> | |
</div> | |
</li> | |
<li>
<b><code>headlines_df</code></b>( <code><span title="pandas.DataFrame">DataFrame</span></code>
) –
<div class="doc-md-description">
<p>Rows whose <code>Category</code> is <code>"Headlines"</code>, as split off by <code>prepare_data</code>.</p>
</div>
</li>
<li>
<b><code>class_to_index</code></b>( <code><span title="typing.Dict">Dict</span></code>
) –
<div class="doc-md-description">
<p>class labels to indices mapping</p>
</div>
</li>
<li>
<b><code>index_to_class</code></b>( <code><span title="typing.Dict">Dict</span></code>
) –
<div class="doc-md-description">
<p>indices to class labels mapping</p>
</div>
</li>
</ul> | |
</td> | |
</tr> | |
</tbody> | |
</table> | |
<details class="quote"> | |
<summary> <code>newsclassifier\data.py</code></summary> | |
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">73</span> | |
<span class="normal">74</span> | |
<span class="normal">75</span> | |
<span class="normal">76</span> | |
<span class="normal">77</span> | |
<span class="normal">78</span> | |
<span class="normal">79</span> | |
<span class="normal">80</span> | |
<span class="normal">81</span> | |
<span class="normal">82</span> | |
<span class="normal">83</span> | |
<span class="normal">84</span> | |
<span class="normal">85</span> | |
<span class="normal">86</span> | |
<span class="normal">87</span> | |
<span class="normal">88</span> | |
<span class="normal">89</span> | |
<span class="normal">90</span> | |
<span class="normal">91</span> | |
<span class="normal">92</span> | |
<span class="normal">93</span> | |
<span class="normal">94</span> | |
<span class="normal">95</span> | |
<span class="normal">96</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span> <span class="nf">preprocess</span><span class="p">(</span><span class="n">df</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">Dict</span><span class="p">]:</span>
<span class="w">    </span><span class="sd">"""Preprocess the data.</span>
<span class="sd">    Args:</span>
<span class="sd">        df: Dataframe on which the preprocessing steps need to be performed.</span>
<span class="sd">    Returns:</span>
<span class="sd">        df: Preprocessed Data.</span>
<span class="sd">        headlines_df: Rows whose Category is "Headlines", as split off by prepare_data.</span>
<span class="sd">        class_to_index: class labels to indices mapping</span>
<span class="sd">        index_to_class: indices to class labels mapping</span>
<span class="sd">    """</span>
    <span class="n">df</span><span class="p">,</span> <span class="n">headlines_df</span> <span class="o">=</span> <span class="n">prepare_data</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
    <span class="n">cats</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="s2">"Category"</span><span class="p">]</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span>
    <span class="n">class_to_index</span> <span class="o">=</span> <span class="p">{</span><span class="n">tag</span><span class="p">:</span> <span class="n">i</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">tag</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">cats</span><span class="p">)}</span>
    <span class="n">index_to_class</span> <span class="o">=</span> <span class="p">{</span><span class="n">v</span><span class="p">:</span> <span class="n">k</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">class_to_index</span><span class="o">.</span><span class="n">items</span><span class="p">()}</span>
    <span class="n">df</span><span class="p">[</span><span class="s2">"Text"</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="s2">"Text"</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">clean_text</span><span class="p">)</span>  <span class="c1"># clean text</span>
    <span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="p">[[</span><span class="s2">"Text"</span><span class="p">,</span> <span class="s2">"Category"</span><span class="p">]]</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>  <span class="c1"># own copy, so the column assignment below cannot trigger SettingWithCopyWarning</span>
    <span class="k">try</span><span class="p">:</span>
        <span class="n">df</span><span class="p">[</span><span class="s2">"Category"</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="s2">"Category"</span><span class="p">]</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">class_to_index</span><span class="p">)</span>  <span class="c1"># label encoding</span>
    <span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
        <span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="p">(</span><span class="n">e</span><span class="p">)</span>
    <span class="k">return</span> <span class="n">df</span><span class="p">,</span> <span class="n">headlines_df</span><span class="p">,</span> <span class="n">class_to_index</span><span class="p">,</span> <span class="n">index_to_class</span>
</code></pre></div></td></tr></table></div> | |
</details> | |
</div> | |
</div> | |
</div> | |
</div> | |
</div> | |
</div> | |
</div><footer> | |
<div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation"> | |
<a href="../config/" class="btn btn-neutral float-left" title="config"><span class="icon icon-circle-arrow-left"></span> Previous</a> | |
<a href="../models/" class="btn btn-neutral float-right" title="models">Next <span class="icon icon-circle-arrow-right"></span></a> | |
</div> | |
<hr/> | |
<div role="contentinfo"> | |
<!-- Copyright etc --> | |
</div> | |
Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>. | |
</footer> | |
</div> | |
</div> | |
</section> | |
</div> | |
<div class="rst-versions" role="note" aria-label="Versions"> | |
<span class="rst-current-version" data-toggle="rst-current-version"> | |
<span> | |
<a href="https://github.com/ManishW315/NewsClassifier" class="fa fa-github" style="color: #fcfcfc"> GitHub</a> | |
</span> | |
<span><a href="../config/" style="color: #fcfcfc">« Previous</a></span> | |
<span><a href="../models/" style="color: #fcfcfc">Next »</a></span> | |
</span> | |
</div> | |
<script src="../../js/jquery-3.6.0.min.js"></script> | |
<script>var base_url = "../..";</script> | |
<script src="../../js/theme_extra.js"></script> | |
<script src="../../js/theme.js"></script> | |
<script> | |
jQuery(function () { | |
SphinxRtdTheme.Navigation.enable(true); | |
}); | |
</script> | |
</body> | |
</html> | |