bsenst committed on
Commit
2dfb157
·
1 Parent(s): 4c464b0

update low code and use cases, add files

Browse files
src/03_low_code/app_market_scraping/app_market_scraping.qmd CHANGED
@@ -1,7 +1,12 @@
1
  ---
2
  title: "App-Market-Scraping"
3
- description: "Ein Tool zur Extraktion und Analyse von App-Marktdaten, einschließlich benutzerdefinierter Suchparameter und Datenexport."
4
  image: _2f0cb788-71a6-4817-ab94-d38c346e4f6f.jpeg
 
 
 
 
 
5
  ---
6
 
7
  ## Lernziele
@@ -17,7 +22,7 @@ Um Apps zu sammeln, besuchen Sie die [Google Play Search](../../02_basics/app_ma
17
 
18
  ### 1. Installation des Google Play Scrapers
19
 
20
- Um den Google Play Scraper zu installieren, den folgenden Befehl verwenden:
21
 
22
  ```python
23
  !pip install google-play-scraper
@@ -85,3 +90,5 @@ plt.show()
85
  ## Fazit
86
 
87
  Diese Schritte ermöglichen die Installation des Google Play Scrapers, das Einlesen einer CSV-Datei mit App-URLs, das Abrufen von App-Informationen und die Visualisierung der Daten.
 
 
 
1
  ---
2
  title: "App-Market-Scraping"
3
+ description: "Extraktion und Analyse von App-Marktdaten, einschließlich benutzerdefinierter Suchparameter und Datenexport."
4
  image: _2f0cb788-71a6-4817-ab94-d38c346e4f6f.jpeg
5
+ format:
6
+ html:
7
+ toc: true
8
+ code-tools: true
9
+ jupyter: python3
10
  ---
11
 
12
  ## Lernziele
 
22
 
23
  ### 1. Installation des Google Play Scrapers
24
 
25
+ Installiere in einem Colab-Notebook die Google-Play-Scraper-Bibliothek mit dem folgenden Befehl:
26
 
27
  ```python
28
  !pip install google-play-scraper
 
90
  ## Fazit
91
 
92
  Diese Schritte ermöglichen die Installation des Google Play Scrapers, das Einlesen einer CSV-Datei mit App-URLs, das Abrufen von App-Informationen und die Visualisierung der Daten.
93
+
94
+ {{< downloadthis ../../assets/App_Market_Scraping.ipynb dname="App_Market_Scraping" label="Download Notebook Beispiel" icon="journal-code" type="success" >}}
src/03_low_code/catalogue/bookstoscrape.qmd CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: "Web Scraping mit Python: Bücher von Books to Scrape"
3
  description: "Eine Anleitung zum Scraping von Büchern von der Website Books to Scrape, einschließlich Python-Beispielen und Datenexport."
4
  image: _be1bcdc2-f540-4a95-a27c-775e8f2c1c07.jpeg
5
  format:
 
1
  ---
2
+ title: "Bücherkatalog scrapen"
3
  description: "Eine Anleitung zum Scraping von Büchern von der Website Books to Scrape, einschließlich Python-Beispielen und Datenexport."
4
  image: _be1bcdc2-f540-4a95-a27c-775e8f2c1c07.jpeg
5
  format:
src/03_low_code/video_transcripts/social-media.qmd CHANGED
@@ -120,6 +120,7 @@ print(profile)
120
  Web Scraping von Social-Media-Daten ist technisch möglich, aber nur unter strikter Beachtung der rechtlichen und ethischen Vorgaben vertretbar. Die Nutzung offizieller APIs ist der empfohlene Weg, da sie den Zugriff kontrolliert und im Einklang mit den Plattformrichtlinien erlaubt.
121
 
122
  Bevor Scraping-Projekte umgesetzt werden:
 
123
  - **Prüfe die Nutzungsbedingungen der Plattform.**
124
  - **Respektiere die Privatsphäre und Rechte der Nutzer.**
125
  - **Nutze offizielle APIs, wo immer möglich.**
 
120
  Web Scraping von Social-Media-Daten ist technisch möglich, aber nur unter strikter Beachtung der rechtlichen und ethischen Vorgaben vertretbar. Die Nutzung offizieller APIs ist der empfohlene Weg, da sie den Zugriff kontrolliert und im Einklang mit den Plattformrichtlinien erlaubt.
121
 
122
  Bevor Scraping-Projekte umgesetzt werden:
123
+
124
  - **Prüfe die Nutzungsbedingungen der Plattform.**
125
  - **Respektiere die Privatsphäre und Rechte der Nutzer.**
126
  - **Nutze offizielle APIs, wo immer möglich.**
src/04_use_case/forum/buergergeld_forum.ipynb CHANGED
@@ -87,6 +87,13 @@
87
  "# download_pages(base_url, start_page, end_page, output_directory)"
88
  ]
89
  },
 
 
 
 
 
 
 
90
  {
91
  "cell_type": "markdown",
92
  "metadata": {},
 
87
  "# download_pages(base_url, start_page, end_page, output_directory)"
88
  ]
89
  },
90
+ {
91
+ "cell_type": "markdown",
92
+ "metadata": {},
93
+ "source": [
94
+ "Um das wiederholte Abfragen gleicher Inhalte zu vermeiden, können die gesammelten Seiten als Ordner heruntergeladen werden: [buergergeld_forum.zip](https://huggingface.co/spaces/datenwerkzeuge/CDL-Webscraping-Workshop-2025/blob/main/src/assets/buergergeld_forum.zip)"
95
+ ]
96
+ },
97
  {
98
  "cell_type": "markdown",
99
  "metadata": {},
src/_extensions/shafayetShafee/downloadthis/_extension.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ title: Downloadthis
2
+ author: Shafayet Khan Shafee
3
+ version: 1.1.0
4
+ quarto-required: ">=1.2.0"
5
+ contributes:
6
+ shortcodes:
7
+ - downloadthis.lua
8
+
src/_extensions/shafayetShafee/downloadthis/downloadthis.lua ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --[[
2
+ MIT License
3
+
4
+ Copyright (c) 2023 Shafayet Khan Shafee
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
23
+ ]]--
24
+
25
+
26
+
27
+ local str = pandoc.utils.stringify
28
+ --local p = quarto.log.output
29
+
30
--- Register the button stylesheet as an HTML dependency of the document.
-- Hands a dependency description (name, version string, stylesheet path)
-- to quarto.doc.add_html_dependency.
local function ensureHtmlDeps()
  local dependency = {
    name = "downloadthis",
    version = "1.9.1",
    stylesheets = { "resources/css/downloadthis.css" },
  }
  quarto.doc.add_html_dependency(dependency)
end
37
+
38
--- Substitute a default for a missing shortcode argument.
-- @param arg     value supplied by the caller (may be nil or "")
-- @param default value to use when `arg` is nil or the empty string
-- @return `arg` unchanged, or `default` when `arg` is nil or ""
local function optional(arg, default)
  local missing = (arg == nil) or (arg == "")
  if missing then
    return default
  end
  return arg
end
46
+
47
--- Load a Lua file that lives in the same directory as this script.
-- The directory is taken from PANDOC_SCRIPT_FILE (everything up to and
-- including the last "/" or "\"). The full path of `script` is prepended to
-- package.path before `require` is called with the same name.
-- @param script file name of the module, e.g. "puremagic.lua"
-- @return whatever `require(script)` returns
function import(script)
  local script_dir = PANDOC_SCRIPT_FILE:match("(.*[/\\])")
  local search_entry = script_dir .. script
  package.path = search_entry .. ";" .. package.path
  return require(script)
end
52
+
53
+ local puremagic = import("puremagic.lua")
54
+
55
--- Escape text for a single-quoted JavaScript string literal that is itself
-- embedded in a double-quoted HTML attribute (the button's onclick handler).
-- JavaScript escaping is applied first because the browser decodes the HTML
-- attribute before it parses the script.
local function escape_js_in_attr(text)
  local escaped = text:gsub("\\", "\\\\"):gsub("'", "\\'")
  escaped = escaped:gsub("&", "&amp;"):gsub('"', "&quot;")
  escaped = escaped:gsub("<", "&lt;"):gsub(">", "&gt;")
  return escaped
end

return {
  --- Shortcode `{{< downloadthis PATH dname=... label=... icon=... type=... class=... id=... >}}`.
  -- Reads the file at PATH, embeds it as a base64 data URI and returns a raw
  -- HTML button that downloads it when clicked.
  -- Returns pandoc.Null() when the file cannot be opened or when the output
  -- format is not Bootstrap-enabled HTML.
  ['downloadthis'] = function(args, kwargs, meta)

    -- args and kwargs
    local file_path = str(args[1])

    -- Derive the extension from the last path component only, so that dots in
    -- directory names ("../x.y/README") or a missing extension do not leak
    -- path fragments into the download name. No extension yields "".
    local base_name = file_path:match("[^/\\]+$") or file_path
    local ext = base_name:match("%.([^.]+)$")
    local extension = ext and ("." .. ext) or ""

    local dname = optional(str(kwargs["dname"]), "file")
    local dfilename = dname .. extension
    local btn_label = " " .. optional(str(kwargs["label"]), "Download") .. " "
    local btn_type = optional(str(kwargs["type"]), "default")
    local icon = optional(str(kwargs["icon"]), "download")
    local class = " " .. optional(str(kwargs["class"]), "")
    -- Fallback element id used when the author does not supply one.
    local rand = "dnldts" .. str(math.random(1, 65000))
    local id = optional(str(kwargs["id"]), rand)

    -- reading files
    local fh = io.open(file_path, "rb")
    if not fh then
      io.stderr:write("Cannot open file " ..
        file_path ..
        " | Skipping adding buttons\n")
      return pandoc.Null()
    else
      local contents = fh:read("*all")
      fh:close()

      -- creating dataURI object
      local b64_encoded = quarto.base64.encode(contents)
      -- via_path returns nil when it cannot open the file; fall back to a
      -- generic type instead of failing on the concatenation below.
      local mimetype = puremagic.via_path(file_path) or "application/octet-stream"
      local data_uri = 'data:' .. mimetype .. ";base64," .. b64_encoded

      -- js code taken from
      -- https://github.com/fmmattioni/downloadthis/blob/master/R/utils.R#L59
      local js = [[fetch('%s').then(res => res.blob()).then(blob => {
        const downloadURL = window.URL.createObjectURL(blob);
        const a = document.createElement('a');
        document.body.appendChild(a);
        a.href = downloadURL;
        a.download = '%s'; a.click();
        window.URL.revokeObjectURL(downloadURL);
        document.body.removeChild(a);
      });]]

      -- The download name comes from the document author and may contain
      -- quotes or backslashes; escape it for the JS literal / HTML attribute.
      local clicked = js:format(data_uri, escape_js_in_attr(dfilename))

      -- creating button
      local button =
        "<button class=\"btn btn-" .. btn_type .. " downloadthis " ..
        class .. "\"" ..
        " id=\"" .. id .. "\"" ..
        "><i class=\"bi bi-" .. icon .. "\"" .. "></i>" ..
        btn_label ..
        "</button>"
      if quarto.doc.is_format("html:js") and quarto.doc.has_bootstrap()
      then
        ensureHtmlDeps()
        return pandoc.RawInline('html',
          "<a href=\"#" .. id .. "\"" ..
          " onclick=\"" .. clicked .. "\">" .. button .. "</a>"
        )
      else
        return pandoc.Null()
      end
    end
  end
}
120
+
121
+
src/_extensions/shafayetShafee/downloadthis/puremagic.lua ADDED
@@ -0,0 +1,735 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -- puremagic 1.0.1
2
+ -- Copyright (c) 2014 Will Bond <[email protected]>
3
+ -- Licensed under the MIT license.
4
+
5
+
6
--- Return the last component of a path.
-- Both "/" and "\" are treated as separators. When the path contains no
-- separator followed by at least one character, the path itself is returned.
-- @param path string
-- @return the file name part, followed by an explicit nil second value
function basename(path)
    local last_component = path:match('[/\\]([^/\\]+)$')
    return last_component or path, nil
end
14
+
15
+
16
--- Return the lower-cased extension of a path, without the leading dot.
-- Compound endings are kept together: ".tar.<x>" gives "tar.<x>", and the
-- renamed iWork archives give "numbers.zip", "pages.zip" and "key.zip".
-- Returns nil when the path contains no dot-separated suffix.
-- @param path string
function extension(path)
    local lowered = path:lower()

    local tar_ext = lowered:match('%.(tar%.[^.]+)$')
    if tar_ext then
        return tar_ext
    end

    -- Checked in the same order as the dedicated tests they replace.
    local compound_suffixes = {'.numbers.zip', '.pages.zip', '.key.zip'}
    for _, suffix in ipairs(compound_suffixes) do
        if lowered:sub(-#suffix) == suffix then
            return suffix:sub(2)
        end
    end

    return lowered:match('%.([^.]+)$')
end
33
+
34
+
35
--- Report whether `value` occurs in the array part of `list`.
-- Elements are compared with `==`; the scan covers indices 1..#list.
-- @return true when found, false otherwise
function in_table(value, list)
    local index, count = 1, #list
    while index <= count do
        if list[index] == value then
            return true
        end
        index = index + 1
    end
    return false
end
43
+
44
+
45
--- Convert a byte string into a list of bit arrays.
-- Each input byte becomes a table of eight numbers (0 or 1), most
-- significant bit first.
--
-- The bits are computed with `%` and `math.floor` so they stay integers on
-- Lua 5.3+, where `/` always produces a float. Float bits would be rendered
-- as "1.0"/"0.0" by `table.concat`, which makes the `tonumber(..., 2)` calls
-- of the consumers return nil.
-- @param chars string of raw bytes
-- @return array with one 8-element bit table per input byte
function string_to_bit_table(chars)
    local output = {}
    for index = 1, #chars do
        local num = chars:byte(index)
        local bits = {0, 0, 0, 0, 0, 0, 0, 0}
        -- Fill from the least significant bit (slot 8) upwards.
        for bit = 8, 1, -1 do
            if num == 0 then
                break
            end
            bits[bit] = num % 2
            num = math.floor(num / 2)
        end
        output[#output + 1] = bits
    end
    return output
end
60
+
61
+
62
--- Convert a list of bit arrays back into a byte string.
-- Each entry of `bits` is concatenated into a string of binary digits,
-- parsed as a base-2 number and emitted as one character.
-- @param bits array of bit tables (entries 0 or 1)
-- @return string with one character per bit table
function bit_table_to_string(bits)
    local chars = {}
    for index = 1, #bits do
        local code = tonumber(table.concat(bits[index]), 2)
        chars[index] = string.format('%c', code)
    end
    return table.concat(chars)
end
70
+
71
+
72
--- Byte-wise AND of two byte strings.
-- Both strings are expanded with string_to_bit_table, combined bit by bit
-- and packed again with bit_table_to_string. The result has one byte per
-- byte of `a`.
-- @param a string of raw bytes
-- @param b string of raw bytes used as the mask
-- @return string containing the AND of each byte pair
function bitwise_and(a, b)
    local left = string_to_bit_table(a)
    local right = string_to_bit_table(b)

    local masked = {}
    for byte_index = 1, #left do
        local l_bits, r_bits = left[byte_index], right[byte_index]
        local and_bits = {}
        for bit_index = 1, 8 do
            local both_set = l_bits[bit_index] == 1 and r_bits[bit_index] == 1
            and_bits[bit_index] = both_set and 1 or 0
        end
        masked[byte_index] = and_bits
    end

    return bit_table_to_string(masked)
end
91
+
92
+
93
--- Unpack a little endian byte string into an integer.
-- The value is accumulated directly from the byte codes, so the result is a
-- plain number on every Lua version. The earlier route through bit tables
-- and `tonumber(..., 2)` returned nil on Lua 5.3+.
-- @param chars string of raw bytes, least significant byte first
-- @return the decoded number, or nil for an empty string
function unpack_le(chars)
    if #chars == 0 then
        return nil
    end
    local value = 0
    -- Walk from the most significant (last) byte down to the first.
    for index = #chars, 1, -1 do
        value = value * 256 + chars:byte(index)
    end
    return value
end
103
+
104
+
105
--- Unpack a big endian byte string into an integer.
-- The value is accumulated directly from the byte codes, so the result is a
-- plain number on every Lua version. The earlier route through bit tables
-- and `tonumber(..., 2)` returned nil on Lua 5.3+.
-- @param chars string of raw bytes, most significant byte first
-- @return the decoded number, or nil for an empty string
function unpack_be(chars)
    if #chars == 0 then
        return nil
    end
    local value = 0
    for index = 1, #chars do
        value = value * 256 + chars:byte(index)
    end
    return value
end
114
+
115
+
116
--- Takes the first 4-8k of an EBML file and identifies if it is matroska or
-- webm and if it contains just video or just audio.
--
-- Missing optional children (seek entries, seek positions, track entries,
-- track types) are skipped instead of raising, and a seek target is only
-- followed when it points forward, so a malformed seek table cannot make the
-- scan loop forever.
-- @param content string with the leading bytes of the file
-- @return MIME type string, or nil plus an error message
function ebml_parse(content)
    local position = 1
    local length = #content

    local header_token, header_value, used_bytes = ebml_parse_section(content)
    position = position + used_bytes

    if header_token ~= '\x1AE\xDF\xA3' then
        return nil, 'Unable to find EBML ID'
    end

    -- The matroska spec sets the default doctype to be 'matroska', however
    -- many files specify this anyway. The other option is 'webm'.
    local doctype = 'matroska'
    if header_value['B\x82'] then
        doctype = header_value['B\x82']
    end

    if doctype ~= 'matroska' and doctype ~= 'webm' then
        return nil, 'Unknown EBML doctype'
    end

    local segment_position = nil
    local track_position = nil
    local has_video = false
    local found_tracks = false

    while position <= length do
        local ebml_id, ebml_value, used_bytes = ebml_parse_section(content:sub(position, length))
        position = position + used_bytes

        -- Segment
        if ebml_id == '\x18S\x80g' then
            segment_position = position
        end

        -- Meta seek information
        if ebml_id == '\x11M\x9Bt' then
            -- Look for the seek info about the tracks token
            for _, child in ipairs(ebml_value['M\xBB'] or {}) do
                if child['S\xAB'] == '\x16T\xAEk' then
                    local seek_offset = child['S\xAC'] and unpack_be(child['S\xAC'])
                    if segment_position and seek_offset then
                        local target = segment_position + seek_offset
                        -- Elements before the current position have already
                        -- been visited, so only a forward jump is useful.
                        if target >= position then
                            track_position = target
                            position = track_position
                        end
                    end
                    break
                end
            end
        end

        -- Track
        if ebml_id == '\x16T\xAEk' then
            found_tracks = true
            -- Scan through each track looking for video
            for _, child in ipairs(ebml_value['\xAE'] or {}) do
                -- Look to see if the track type is video
                local track_type = child['\x83']
                if track_type and unpack_be(track_type) == 1 then
                    has_video = true
                    break
                end
            end
            break
        end
    end

    if found_tracks and not has_video then
        if doctype == 'matroska' then
            return 'audio/x-matroska'
        else
            return 'audio/webm'
        end
    end

    if doctype == 'matroska' then
        return 'video/x-matroska'
    else
        return 'video/webm'
    end
end
196
+
197
+
198
--- Parse one EBML element from the start of `content`.
-- Returns the element ID (in its encoded form), the element value and the
-- number of bytes consumed by ID, length field and value.
--
-- The value is the raw byte string, except for elements that are parsed
-- recursively (4-byte IDs, Seek entries and TrackEntry elements): those
-- yield a table keyed by child ID. A child ID that occurs more than once is
-- stored as an array of values. The Segment element is never descended into
-- and yields a nil value with only its header counted as consumed.
function ebml_parse_section(content)
    local ebml_id, element_length, used_bytes = ebml_id_and_length(content)

    -- Don't parse the segment since it is the whole file!
    if ebml_id == '\x18\x53\x80\x67' then
        return ebml_id, nil, used_bytes
    end

    local ebml_value = content:sub(used_bytes + 1, used_bytes + element_length)
    used_bytes = used_bytes + element_length

    -- Level 0/1 elements (4-byte IDs) are always parsed; Seek entries and
    -- TrackEntry elements are parsed to reach the seek data and TrackType.
    local recursive_parse = #ebml_id == 4
        or ebml_id == '\x4D\xBB'
        or ebml_id == '\xAE'
    if not recursive_parse then
        return ebml_id, ebml_value, used_bytes
    end

    local remaining = ebml_value
    local children = {}
    -- Child IDs whose entry in `children` has been converted to an array
    local is_array = {}

    while #remaining > 0 do
        local child_id, child_value, child_used = ebml_parse_section(remaining)
        local existing = children[child_id]

        if is_array[child_id] then
            table.insert(existing, child_value)

        -- Single values are just stored by themselves
        elseif existing == nil then
            -- Force seek info and tracks to be arrays even if there is only one
            if child_id == 'M\xBB' or child_id == '\xAE' then
                child_value = {child_value}
                is_array[child_id] = true
            end
            children[child_id] = child_value

        -- If there is already a value for the ID, turn it into a table
        else
            children[child_id] = {existing, child_value}
            is_array[child_id] = true
        end

        -- Move past the part we've parsed
        remaining = remaining:sub(child_used + 1, #remaining)
    end

    return ebml_id, children, used_bytes
end
261
+
262
+
263
--- Read the ID and the data length at the start of an EBML element.
-- Should be given 12+ bytes. Returns the EBML ID, the data length and the
-- number of bytes that were used to hold those two fields.
function ebml_id_and_length(chars)
    -- The ID is encoded the same way as the length. It is kept in its
    -- encoded form (length marker bits included, not interpreted as an
    -- unsigned int), because the documentation online references IDs that
    -- way. Only the byte count reported by ebml_length is used here.
    local _, id_length = ebml_length(chars:sub(1, 4))
    local ebml_id = chars:sub(1, id_length)

    -- The length field follows the ID and is at most 8 bytes long.
    local length_start = id_length + 1
    local length_field = chars:sub(length_start, length_start + 7)
    local element_length, length_bytes = ebml_length(length_field)

    return ebml_id, element_length, id_length + length_bytes
end
278
+
279
+
280
--- Decode an EBML variable-length integer.
-- Should be given 8+ bytes (only the first 8 are looked at). The number of
-- leading zero bits in the first byte, plus one, is the width of the field
-- in bytes. The first set bit is a marker and is cleared before the
-- remaining bits are read as a big endian number.
--
-- The value is computed with integer byte arithmetic so it stays a usable
-- number on Lua 5.3+ (the bit-table/`tonumber` route returned nil there).
-- Input that is empty or shorter than the announced width raises an error
-- with a descriptive message.
-- @param chars string of raw bytes
-- @return the decoded value and the number of bytes the field occupies
function ebml_length(chars)
    chars = chars:sub(1, 8)

    local first = chars:byte(1)
    if not first then
        error('ebml_length: no data to decode', 2)
    end

    -- Find the marker bit, scanning from the most significant bit down.
    local masks = {128, 64, 32, 16, 8, 4, 2, 1}
    local value_length = 1
    for index = 1, #masks do
        if first >= masks[index] then
            -- Clear the indicator bit so the rest of the byte is pure data
            first = first - masks[index]
            break
        end
        value_length = value_length + 1
    end

    if #chars < value_length then
        error('ebml_length: truncated variable-length integer', 2)
    end

    local value = first
    for index = 2, value_length do
        value = value * 256 + chars:byte(index)
    end

    return value, value_length
end
304
+
305
+
306
--- Detect the MIME type of binary content from its leading magic bytes.
-- The checks run in a fixed order (images, audio/video, documents, office
-- formats, archives, executables) and the first match wins. `ext` is only
-- consulted where the container is ambiguous (OLE2 and ZIP based documents).
--
-- Signatures follow the file formats: JPEG 2000 and the QuickTime "qt"
-- brand are padded with two spaces, the OGM marker is "\x01vide", the Ogg
-- FLAC marker starts with 0x7F, GNU tar uses "ustar" followed by two spaces
-- and NUL, and the PE DLL flag (0x2000) is tested in little endian order.
-- Truncated "MZ" files and truncated or malformed EBML data are skipped
-- instead of raising.
-- @param content string with the first bytes of the file
-- @param ext     lower-cased file extension, or nil
-- @return MIME type string, or nil when nothing matched
function binary_tests(content, ext)
    local length = #content
    local _1_8 = content:sub(1, 8)
    local _1_7 = content:sub(1, 7)
    local _1_6 = content:sub(1, 6)
    local _1_5 = content:sub(1, 5)
    local _1_4 = content:sub(1, 4)
    local _1_3 = content:sub(1, 3)
    local _1_2 = content:sub(1, 2)
    local _9_12 = content:sub(9, 12)


    -- Images
    if _1_4 == '\xC5\xD0\xD3\xC6' then
        -- With a Windows-format EPS, the file starts right after a 30-byte
        -- header, or a 30-byte header followed by two bytes of padding
        if content:sub(33, 42) == '%!PS-Adobe' or content:sub(31, 40) == '%!PS-Adobe' then
            return 'application/postscript'
        end
    end

    if _1_8 == '%!PS-Ado' and content:sub(9, 10) == 'be' then
        return 'application/postscript'
    end

    if _1_4 == 'MM\x00*' or _1_4 == 'II*\x00' then
        return 'image/tiff'
    end

    if _1_8 == '\x89PNG\r\n\x1A\n' then
        return 'image/png'
    end

    if _1_6 == 'GIF87a' or _1_6 == 'GIF89a' then
        return 'image/gif'
    end

    if _1_4 == 'RIFF' and _9_12 == 'WEBP' then
        return 'image/webp'
    end

    if _1_2 == 'BM' and length > 14 and in_table(content:sub(15, 15), {'\x0C', '(', '@', '\x80'}) then
        return 'image/x-ms-bmp'
    end

    local normal_jpeg = length > 10 and in_table(content:sub(7, 10), {'JFIF', 'Exif'})
    local photoshop_jpeg = length > 24 and _1_4 == '\xFF\xD8\xFF\xED' and content:sub(21, 24) == '8BIM'
    if normal_jpeg or photoshop_jpeg then
        return 'image/jpeg'
    end

    if _1_4 == '8BPS' then
        return 'image/vnd.adobe.photoshop'
    end

    -- JPEG 2000 signature box: 00 00 00 0C 'j' 'P' 20 20 0D 0A 87 0A.
    -- The two spaces are written as escapes so they cannot be collapsed.
    if _1_8 == '\x00\x00\x00\x0CjP\x20\x20' and _9_12 == '\r\n\x87\n' then
        return 'image/jp2'
    end

    if _1_4 == '\x00\x00\x01\x00' then
        return 'application/vnd.microsoft.icon'
    end


    -- Audio/Video
    if _1_4 == '\x1AE\xDF\xA3' and length > 1000 then
        -- Only the head of the file is available, so the EBML parser may run
        -- into truncated elements; treat a parse error as "not identified".
        local ok, mimetype = pcall(ebml_parse, content)

        if ok and mimetype then
            return mimetype
        end
    end

    if _1_4 == 'MOVI' then
        if in_table(content:sub(5, 8), {'moov', 'mdat'}) then
            return 'video/quicktime'
        end
    end

    if length > 8 and content:sub(5, 8) == 'ftyp' then
        local lower_9_12 = _9_12:lower()

        if in_table(lower_9_12, {'avc1', 'isom', 'iso2', 'mp41', 'mp42', 'mmp4', 'ndsc', 'ndsh', 'ndsm', 'ndsp', 'ndss', 'ndxc', 'ndxh', 'ndxm', 'ndxp', 'ndxs', 'f4v ', 'f4p ', 'm4v '}) then
            return 'video/mp4'
        end

        if in_table(lower_9_12, {'msnv', 'ndas', 'f4a ', 'f4b ', 'm4a ', 'm4b ', 'm4p '}) then
            return 'audio/mp4'
        end

        if in_table(lower_9_12, {'3g2a', '3g2b', '3g2c', 'kddi'}) then
            return 'video/3gpp2'
        end

        if in_table(lower_9_12, {'3ge6', '3ge7', '3gg6', '3gp1', '3gp2', '3gp3', '3gp4', '3gp5', '3gp6', '3gs7'}) then
            return 'video/3gpp'
        end

        -- Brands are four bytes: 'mqt ' and 'qt' padded with two spaces.
        if lower_9_12 == 'mqt ' or lower_9_12 == 'qt\x20\x20' then
            return 'video/quicktime'
        end

        if lower_9_12 == 'jp2 ' then
            return 'image/jp2'
        end
    end

    -- MP3
    if bitwise_and(_1_2, '\xFF\xF6') == '\xFF\xF2' then
        local byte_3 = content:sub(3, 3)
        if bitwise_and(byte_3, '\xF0') ~= '\xF0' and bitwise_and(byte_3, "\x0C") ~= "\x0C" then
            return 'audio/mpeg'
        end
    end
    if _1_3 == 'ID3' then
        return 'audio/mpeg'
    end

    if _1_4 == 'fLaC' then
        return 'audio/x-flac'
    end

    if _1_8 == '0&\xB2u\x8Ef\xCF\x11' then
        -- Without writing a full-on ASF parser, we can just scan for the
        -- UTF-16 string "AspectRatio"
        if content:find('\x00A\x00s\x00p\x00e\x00c\x00t\x00R\x00a\x00t\x00i\x00o', 1, true) then
            return 'video/x-ms-wmv'
        end
        return 'audio/x-ms-wma'
    end

    if _1_4 == 'RIFF' and _9_12 == 'AVI ' then
        return 'video/x-msvideo'
    end

    if _1_4 == 'RIFF' and _9_12 == 'WAVE' then
        return 'audio/x-wav'
    end

    if _1_4 == 'FORM' and _9_12 == 'AIFF' then
        return 'audio/x-aiff'
    end

    if _1_4 == 'OggS' then
        local _29_33 = content:sub(29, 33)
        if _29_33 == '\x01vorb' then
            return 'audio/vorbis'
        end
        -- Ogg FLAC packets start with byte 0x7F followed by "FLAC"
        if _29_33 == '\x7FFLAC' then
            return 'audio/x-flac'
        end
        if _29_33 == 'OpusH' then
            return 'audio/ogg'
        end
        -- Theora and OGM ("\x01video"); both markers are five bytes long
        if _29_33 == '\x80theo' or _29_33 == '\x01vide' then
            return 'video/ogg'
        end
    end

    if _1_3 == 'FWS' or _1_3 == 'CWS' then
        return 'application/x-shockwave-flash'
    end

    if _1_3 == 'FLV' then
        return 'video/x-flv'
    end


    if _1_5 == '%PDF-' then
        return 'application/pdf'
    end

    if _1_5 == '{\\rtf' then
        return 'text/rtf'
    end


    -- Office '97-2003 formats
    if _1_8 == '\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1' then
        if in_table(ext, {'xls', 'csv', 'tab'}) then
            return 'application/vnd.ms-excel'
        end
        if ext == 'ppt' then
            return 'application/vnd.ms-powerpoint'
        end
        -- We default to word since we need something if the extension isn't recognized
        return 'application/msword'
    end

    if _1_8 == '\x09\x04\x06\x00\x00\x00\x10\x00' then
        return 'application/vnd.ms-excel'
    end

    if _1_6 == '\xDB\xA5\x2D\x00\x00\x00' or _1_5 == '\x50\x4F\x5E\x51\x60' or _1_4 == '\xFE\x37\x00\x23' or _1_3 == '\x94\xA6\x2E' then
        return 'application/msword'
    end

    if _1_4 == 'PK\x03\x04' then
        -- Office XML formats
        if ext == 'xlsx' then
            return 'application/vnd.ms-excel'
        end

        if ext == 'pptx' then
            return 'application/vnd.ms-powerpoint'
        end

        if ext == 'docx' then
            return 'application/msword'
        end

        -- Open Office formats
        if ext == 'ods' then
            return 'application/vnd.oasis.opendocument.spreadsheet'
        end

        if ext == 'odp' then
            return 'application/vnd.oasis.opendocument.presentation'
        end

        if ext == 'odt' then
            return 'application/vnd.oasis.opendocument.text'
        end

        -- iWork - some programs like Mac Mail change the filename to
        -- .numbers.zip, etc
        if ext == 'pages' or ext == 'pages.zip' then
            return 'application/vnd.apple.pages'
        end
        if ext == 'key' or ext == 'key.zip' then
            return 'application/vnd.apple.keynote'
        end
        if ext == 'numbers' or ext == 'numbers.zip' then
            return 'application/vnd.apple.numbers'
        end

        -- Otherwise just a zip
        return 'application/zip'
    end


    -- Archives
    if length > 257 then
        -- POSIX ustar: "ustar" followed by NUL
        if content:sub(258, 263) == 'ustar\x00' then
            return 'application/x-tar'
        end
        -- GNU tar: "ustar" followed by two spaces and NUL
        if content:sub(258, 265) == 'ustar\x20\x20\x00' then
            return 'application/x-tar'
        end
    end

    if _1_7 == 'Rar!\x1A\x07\x00' or _1_8 == 'Rar!\x1A\x07\x01\x00' then
        return 'application/x-rar-compressed'
    end

    if _1_2 == '\x1F\x9D' then
        return 'application/x-compress'
    end

    if _1_2 == '\x1F\x8B' then
        return 'application/x-gzip'
    end

    if _1_3 == 'BZh' then
        return 'application/x-bzip2'
    end

    if _1_6 == '\xFD7zXZ\x00' then
        return 'application/x-xz'
    end

    if _1_6 == '7z\xBC\xAF\x27\x1C' then
        return 'application/x-7z-compressed'
    end

    -- The offset of the PE header is stored in bytes 61-64 of the DOS header,
    -- so anything shorter cannot be inspected (e.g. a text file starting
    -- with "MZ").
    if _1_2 == 'MZ' and length >= 64 then
        local pe_header_start = unpack_le(content:sub(61, 64))

        if pe_header_start then
            local signature = content:sub(pe_header_start + 1, pe_header_start + 4)

            if signature == 'PE\x00\x00' then
                local image_file_header_start = pe_header_start + 5
                local characteristics = content:sub(image_file_header_start + 18, image_file_header_start + 19)
                -- Characteristics is a little endian 16-bit field and
                -- IMAGE_FILE_DLL is 0x2000, so the flag is in the second byte.
                local is_dll = bitwise_and(characteristics, '\x00\x20') == '\x00\x20'

                if is_dll then
                    return 'application/x-msdownload'
                end

                return 'application/octet-stream'
            end
        end
    end

    return nil
end
601
+
602
+
603
--- Detect the MIME type of textual content from recognisable tokens.
-- Checks, in order: a PostScript header, PHP open tags, an XML declaration
-- (refined to SVG, XHTML or RSS), an HTML start, and interpreter shebang
-- lines. Matching is case-insensitive except where the original content is
-- searched directly.
-- @param content string with the first bytes of the file
-- @return MIME type string, or nil when nothing matched
function text_tests(content)
    local lowered = content:lower()

    if content:find('^%%!PS-Adobe') then
        return 'application/postscript'
    end

    local has_php_tag = lowered:find('<?php', 1, true)
    local has_short_echo_tag = content:find('<?=', 1, true)
    if has_php_tag or has_short_echo_tag then
        return 'application/x-httpd-php'
    end

    if lowered:find('^%s*<%?xml') then
        if content:find('<svg') then
            return 'image/svg+xml'
        elseif lowered:find('<!doctype html') then
            return 'application/xhtml+xml'
        elseif content:find('<rss') then
            return 'application/rss+xml'
        end
        return 'application/xml'
    end

    local starts_like_html = lowered:find('^%s*<html') or lowered:find('^%s*<!doctype')
    if starts_like_html then
        return 'text/html'
    end

    -- Shebang lines, tested in this fixed order
    local shebangs = {
        {'^#![/a-z0-9]+ ?python', 'application/x-python'},
        {'^#![/a-z0-9]+ ?perl', 'application/x-perl'},
        {'^#![/a-z0-9]+ ?ruby', 'application/x-ruby'},
        {'^#![/a-z0-9]+ ?php', 'application/x-httpd-php'},
        {'^#![/a-z0-9]+ ?bash', 'text/x-shellscript'},
    }
    for index = 1, #shebangs do
        local entry = shebangs[index]
        if lowered:find(entry[1]) then
            return entry[2]
        end
    end

    return nil
end
653
+
654
+
655
-- MIME types for text formats that are identified by extension only
local ext_map = {
    -- markup and styling
    css = 'text/css',
    htm = 'text/html',
    html = 'text/html',
    xhtml = 'text/html',
    xml = 'application/xml',
    rss = 'application/rss+xml',
    -- data
    csv = 'text/csv',
    tab = 'text/tab-separated-values',
    ics = 'text/calendar',
    vcf = 'text/x-vcard',
    -- scripts
    js = 'application/javascript',
    php = 'application/x-httpd-php',
    php3 = 'application/x-httpd-php',
    php4 = 'application/x-httpd-php',
    php5 = 'application/x-httpd-php',
    inc = 'application/x-httpd-php',
    pl = 'application/x-perl',
    cgi = 'application/x-perl',
    py = 'application/x-python',
    rb = 'application/x-ruby',
    rhtml = 'application/x-ruby',
    sh = 'text/x-shellscript'
}

--- Map a file extension to a MIME type.
-- @param ext lower-cased extension without the dot (may be nil)
-- @return the type from ext_map, or 'text/plain' when there is no entry
function ext_tests(ext)
    return ext_map[ext] or 'text/plain'
end
687
+
688
+
689
+ local _M = {}
690
+
691
+
692
--- Detect the MIME type of the file at `path`.
-- Reads up to the first 4096 bytes and hands them to _M.via_content.
-- The file is opened in binary mode so the magic bytes are read unmodified
-- on every platform, and an empty file is treated as empty content (where
-- `read` returns nil) instead of failing.
-- @param path     path of the file to inspect
-- @param filename optional name used for extension-based detection;
--                 defaults to the last component of `path`
-- @return MIME type string, or nil plus the io.open error message when the
--         file cannot be opened
function _M.via_path(path, filename)
    local f, err = io.open(path, 'rb')
    if not f then
        return nil, err
    end

    local content = f:read(4096) or ''
    f:close()

    if not filename then
        filename = basename(path)
    end

    return _M.via_content(content, filename)
end
707
+
708
+
709
--- Detect the MIME type of in-memory content.
-- Detection runs in stages: magic-byte tests first; if those find nothing
-- and the content contains control bytes it is reported as
-- 'application/octet-stream'; otherwise text tokens are tried and finally
-- the file extension decides.
-- @param content  string with the first bytes of the file
-- @param filename file name used to derive the extension
-- @return MIME type string
function _M.via_content(content, filename)
    local ext = extension(filename)

    local detected = binary_tests(content, ext)
    if not detected then
        -- Binary-looking files should have been detected so far
        if content:find('[%z\x01-\x08\x0B\x0C\x0E-\x1F]') then
            return 'application/octet-stream'
        end

        -- No low ASCII chars: look for distinguishable text tokens, then
        -- fall back to the file extension
        detected = text_tests(content) or ext_tests(ext)
    end

    return detected
end
734
+
735
+ return _M
src/_extensions/shafayetShafee/downloadthis/resources/css/downloadthis.css ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .downloadthis:focus,
2
+ .downloadthis:active {
3
+ box-shadow: none !important;
4
+ }
5
+
6
+ .downloadthis:hover {
7
+ transition: 0.2s;
8
+ filter: brightness(0.90);
9
+ }
10
+
11
+ .downloadthis:active {
12
+ filter: brightness(0.80);
13
+ }
src/assets/App_Market_Scraping.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
src/assets/buergergeld_forum.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae6e5db4a58dcfa7343cbcfed582e1e12d635e747834f283db2c8ee6099a8d69
3
+ size 1695491