# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "marimo",
#     "plotly.express",
#     "plotly==6.0.1",
#     "duckdb==1.2.1",
#     "sqlglot==26.11.1",
# ]
# ///


import marimo

__generated_with = "0.11.17"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    import plotly.express as px
    return mo, px


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""#Loading CSVs with DuckDB""")
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""

        <p> I remember when I first learnt about DuckDB - it was a game changer. I used to load the data I wanted to work on into database software like MS SQL Server, then build a bridge to an IDE for whichever language I wanted to use, like Python or R; it was quite the hassle. DuckDB changed my whole world: now I could simply import the data file into the IDE or notebook, open a DuckDB connection, and get going. Then I realized I didn't even need the step of first importing the file with Python - I could query the CSV file directly with SQL through a DuckDB connection (there's a small sketch of exactly that right after this introduction). </p>

        ## Introduction

        <p> I found this dataset on the evolution of AI research by discipline from the <a href="https://oecd.ai/en/data?selectedArea=ai-research&selectedVisualization=16731"> OECD </a>, and it piqued my interest. My impression is that publications in natural language processing jumped drastically in the mid-2010s, and I'm excited to find out whether that's the case. </p>

        <p> In this notebook, we'll: </p>

        <ul>
            <li> Import the CSV file into the notebook </li>
            <li> Create another table within the database based on the CSV </li>
            <li> Dig into how publications on natural language processing have evolved over the years </li>
        </ul>

        """
    )
    return
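

@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        <p> Here's a minimal sketch of that idea using DuckDB's plain Python API, assuming a local copy of the dataset saved as <code>AI_Research_Data.csv</code> - adjust the path to wherever your CSV lives. The rest of the notebook uses marimo's SQL cells instead. </p>
        """
    )
    return


@app.cell
def _():
    # Minimal sketch: DuckDB treats the CSV file itself as a table, so no
    # separate import step is needed. Assumes a local copy of the dataset
    # saved as AI_Research_Data.csv next to this notebook.
    import duckdb

    duckdb.sql("SELECT * FROM read_csv('AI_Research_Data.csv') LIMIT 5")
    return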


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""##Load the CSV""")
    return


@app.cell
def _(mo):
    _df = mo.sql(
        f"""

        /* Another way to load the CSV could be:

        SELECT *
        FROM read_csv('https://github.com/Mustjaab/Loading_CSVs_in_DuckDB/blob/main/AI_Research_Data.csv')
        */

        SELECT *
        FROM "https://github.com/Mustjaab/Loading_CSVs_in_DuckDB/blob/main/AI_Research_Data.csv"
        LIMIT 5;

        """
    )
    return
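

@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        <p> As an optional aside, DuckDB's <code>DESCRIBE</code> shows the column names and types it inferred while reading the CSV - a handy check before building other tables on top of it. </p>
        """
    )
    return


@app.cell
def _(mo):
    _schema = mo.sql(
        f"""
        -- Inspect the schema DuckDB inferred for the CSV
        DESCRIBE SELECT *
        FROM "https://github.com/Mustjaab/Loading_CSVs_in_DuckDB/blob/main/AI_Research_Data.csv";
        """
    )
    return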


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""##Create Another Table""")
    return


@app.cell
def _(mo):
    Discipline_Analysis = mo.sql(
        f"""

        -- Build a table from the CSV that contains only the columns we need
        CREATE TABLE Domain_Analysis AS
            SELECT Year, Concept, publications
            FROM "https://github.com/Mustjaab/Loading_CSVs_in_DuckDB/blob/main/AI_Research_Data.csv"

        """
    )
    return Discipline_Analysis, Domain_Analysis
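

@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        <p> As a quick sanity check, counting the rows in <code>Domain_Analysis</code> confirms that the CREATE TABLE step actually materialized data from the CSV. </p>
        """
    )
    return


@app.cell
def _(Domain_Analysis, mo):
    _row_count = mo.sql(
        f"""
        -- Sanity check: how many rows landed in the new table?
        SELECT COUNT(*) AS row_count
        FROM Domain_Analysis;
        """
    )
    return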


@app.cell
def _(Domain_Analysis, mo):
    Analysis = mo.sql(
        f"""

        -- GROUP BY with no aggregates acts like SELECT DISTINCT here,
        -- collapsing any duplicate (Concept, Year, publications) rows
        SELECT *
        FROM Domain_Analysis
        GROUP BY Concept, Year, publications
        ORDER BY Year

        """
    )
    return (Analysis,)


@app.cell
def _(Domain_Analysis, mo):
    _df = mo.sql(
        f"""

        -- Conditional aggregation: each CASE keeps publications only for its period,
        -- so the two averages compare NLP output before 2020 and from 2020 onward
        SELECT
            AVG(CASE WHEN Year < 2020 THEN publications END) AS avg_pre_2020,
            AVG(CASE WHEN Year >= 2020 THEN publications END) AS avg_2020_onward
        FROM Domain_Analysis
        WHERE Concept = 'Natural language processing';

        """
    )
    return


@app.cell
def _(Domain_Analysis, mo):
    NLP_Analysis = mo.sql(
        f"""

        -- Label each yearly publication count with its period for the box plot below
        SELECT
            publications,
            CASE
                WHEN Year < 2020 THEN 'Pre-2020'
                ELSE '2020-onward'
            END AS period
        FROM Domain_Analysis
        WHERE Year >= 2000
        AND Concept = 'Natural language processing';

        """,
        output=False  # don't render the table here; it's only consumed by the plot cell below
    )
    return (NLP_Analysis,)


@app.cell
def _(NLP_Analysis, px):
    px.box(NLP_Analysis, x="period", y="publications", color="period")
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""<p> We can see there's a significant increase in NLP publications 2020 and onwards which definitely makes sense provided the rapid emergence of commercial large langage models, and AI assistants. </p>""")
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""

        ## Conclusion

        <p> In this notebook, we learned how to: </p>

        <ul>
            <li> Load a CSV into DuckDB </li>
            <li> Create other tables using the imported CSV </li>
            <li> Seamlessly analyze and visualize data across SQL and Python cells </li>
        </ul>

        """
    )
    return


if __name__ == "__main__":
    app.run()