update low code and use cases, add files
Browse files- src/03_low_code/app_market_scraping/app_market_scraping.qmd +9 -2
- src/03_low_code/catalogue/bookstoscrape.qmd +1 -1
- src/03_low_code/video_transcripts/social-media.qmd +1 -0
- src/04_use_case/forum/buergergeld_forum.ipynb +7 -0
- src/_extensions/shafayetShafee/downloadthis/_extension.yml +8 -0
- src/_extensions/shafayetShafee/downloadthis/downloadthis.lua +121 -0
- src/_extensions/shafayetShafee/downloadthis/puremagic.lua +735 -0
- src/_extensions/shafayetShafee/downloadthis/resources/css/downloadthis.css +13 -0
- src/assets/App_Market_Scraping.ipynb +0 -0
- src/assets/buergergeld_forum.zip +3 -0
src/03_low_code/app_market_scraping/app_market_scraping.qmd
CHANGED
@@ -1,7 +1,12 @@
|
|
1 |
---
|
2 |
title: "App-Market-Scraping"
|
3 |
-
description: "
|
4 |
image: _2f0cb788-71a6-4817-ab94-d38c346e4f6f.jpeg
|
|
|
|
|
|
|
|
|
|
|
5 |
---
|
6 |
|
7 |
## Lernziele
|
@@ -17,7 +22,7 @@ Um Apps zu sammeln, besuchen Sie die [Google Play Search](../../02_basics/app_ma
|
|
17 |
|
18 |
### 1. Installation des Google Play Scrapers
|
19 |
|
20 |
-
|
21 |
|
22 |
```python
|
23 |
!pip install google-play-scraper
|
@@ -85,3 +90,5 @@ plt.show()
|
|
85 |
## Fazit
|
86 |
|
87 |
Diese Schritte ermöglichen die Installation des Google Play Scrapers, das Einlesen einer CSV-Datei mit App-URLs, das Abrufen von App-Informationen und die Visualisierung der Daten.
|
|
|
|
|
|
1 |
---
|
2 |
title: "App-Market-Scraping"
|
3 |
+
description: "Extraktion und Analyse von App-Marktdaten, einschließlich benutzerdefinierter Suchparameter und Datenexport."
|
4 |
image: _2f0cb788-71a6-4817-ab94-d38c346e4f6f.jpeg
|
5 |
+
format:
|
6 |
+
html:
|
7 |
+
toc: true
|
8 |
+
code-tools: true
|
9 |
+
jupyter: python3
|
10 |
---
|
11 |
|
12 |
## Lernziele
|
|
|
22 |
|
23 |
### 1. Installation des Google Play Scrapers
|
24 |
|
25 |
+
Installieren Sie in einem Colab-Notebook die Google-Play-Scraper-Bibliothek mit dem folgenden Befehl:
|
26 |
|
27 |
```python
|
28 |
!pip install google-play-scraper
|
|
|
90 |
## Fazit
|
91 |
|
92 |
Diese Schritte ermöglichen die Installation des Google Play Scrapers, das Einlesen einer CSV-Datei mit App-URLs, das Abrufen von App-Informationen und die Visualisierung der Daten.
|
93 |
+
|
94 |
+
{{< downloadthis ../../assets/App_Market_Scraping.ipynb dname="App_Market_Scraping" label="Download Notebook Beispiel" icon="journal-code" type="success" >}}
|
src/03_low_code/catalogue/bookstoscrape.qmd
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title: "
|
3 |
description: "Eine Anleitung zum Scraping von Büchern von der Website Books to Scrape, einschließlich Python-Beispielen und Datenexport."
|
4 |
image: _be1bcdc2-f540-4a95-a27c-775e8f2c1c07.jpeg
|
5 |
format:
|
|
|
1 |
---
|
2 |
+
title: "Bücherkatalog scrapen"
|
3 |
description: "Eine Anleitung zum Scraping von Büchern von der Website Books to Scrape, einschließlich Python-Beispielen und Datenexport."
|
4 |
image: _be1bcdc2-f540-4a95-a27c-775e8f2c1c07.jpeg
|
5 |
format:
|
src/03_low_code/video_transcripts/social-media.qmd
CHANGED
@@ -120,6 +120,7 @@ print(profile)
|
|
120 |
Web Scraping von Social-Media-Daten ist technisch möglich, aber nur unter strikter Beachtung der rechtlichen und ethischen Vorgaben vertretbar. Die Nutzung offizieller APIs ist der empfohlene Weg, da sie den Zugriff kontrolliert und im Einklang mit den Plattformrichtlinien erlaubt.
|
121 |
|
122 |
Bevor Scraping-Projekte umgesetzt werden:
|
|
|
123 |
- **Prüfe die Nutzungsbedingungen der Plattform.**
|
124 |
- **Respektiere die Privatsphäre und Rechte der Nutzer.**
|
125 |
- **Nutze offizielle APIs, wo immer möglich.**
|
|
|
120 |
Web Scraping von Social-Media-Daten ist technisch möglich, aber nur unter strikter Beachtung der rechtlichen und ethischen Vorgaben vertretbar. Die Nutzung offizieller APIs ist der empfohlene Weg, da sie den Zugriff kontrolliert und im Einklang mit den Plattformrichtlinien erlaubt.
|
121 |
|
122 |
Bevor Scraping-Projekte umgesetzt werden:
|
123 |
+
|
124 |
- **Prüfe die Nutzungsbedingungen der Plattform.**
|
125 |
- **Respektiere die Privatsphäre und Rechte der Nutzer.**
|
126 |
- **Nutze offizielle APIs, wo immer möglich.**
|
src/04_use_case/forum/buergergeld_forum.ipynb
CHANGED
@@ -87,6 +87,13 @@
|
|
87 |
"# download_pages(base_url, start_page, end_page, output_directory)"
|
88 |
]
|
89 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
{
|
91 |
"cell_type": "markdown",
|
92 |
"metadata": {},
|
|
|
87 |
"# download_pages(base_url, start_page, end_page, output_directory)"
|
88 |
]
|
89 |
},
|
90 |
+
{
|
91 |
+
"cell_type": "markdown",
|
92 |
+
"metadata": {},
|
93 |
+
"source": [
|
94 |
+
"Um das wiederholte Abfragen gleicher Inhalte zu vermeiden, können die gesammelten Seiten als Ordner heruntergeladen werden: [buergergeld_forum.zip](https://huggingface.co/spaces/datenwerkzeuge/CDL-Webscraping-Workshop-2025/blob/main/src/assets/buergergeld_forum.zip)"
|
95 |
+
]
|
96 |
+
},
|
97 |
{
|
98 |
"cell_type": "markdown",
|
99 |
"metadata": {},
|
src/_extensions/shafayetShafee/downloadthis/_extension.yml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
title: Downloadthis
|
2 |
+
author: Shafayet Khan Shafee
|
3 |
+
version: 1.1.0
|
4 |
+
quarto-required: ">=1.2.0"
|
5 |
+
contributes:
|
6 |
+
shortcodes:
|
7 |
+
- downloadthis.lua
|
8 |
+
|
src/_extensions/shafayetShafee/downloadthis/downloadthis.lua
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
--[[
|
2 |
+
MIT License
|
3 |
+
|
4 |
+
Copyright (c) 2023 Shafayet Khan Shafee
|
5 |
+
|
6 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7 |
+
of this software and associated documentation files (the "Software"), to deal
|
8 |
+
in the Software without restriction, including without limitation the rights
|
9 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10 |
+
copies of the Software, and to permit persons to whom the Software is
|
11 |
+
furnished to do so, subject to the following conditions:
|
12 |
+
|
13 |
+
The above copyright notice and this permission notice shall be included in all
|
14 |
+
copies or substantial portions of the Software.
|
15 |
+
|
16 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
22 |
+
SOFTWARE.
|
23 |
+
]]--
|
24 |
+
|
25 |
+
|
26 |
+
|
27 |
+
local str = pandoc.utils.stringify
|
28 |
+
--local p = quarto.log.output
|
29 |
+
|
30 |
+
--- Register the extension's stylesheet with Quarto as an HTML dependency.
local function ensureHtmlDeps()
  local dependency = {
    name = "downloadthis",
    version = "1.9.1",
    stylesheets = {"resources/css/downloadthis.css"}
  }
  quarto.doc.add_html_dependency(dependency)
end
|
37 |
+
|
38 |
+
--- Return `arg` unless it is nil or the empty string, in which case return
-- `default`.
-- @param arg value supplied by the caller (may be nil or "")
-- @param default fallback value
local function optional(arg, default)
  if arg ~= nil and arg ~= "" then
    return arg
  end
  return default
end
|
46 |
+
|
47 |
+
--- Load a Lua file that lives next to the currently running script.
-- The directory of `PANDOC_SCRIPT_FILE` joined with `script` is prepended to
-- `package.path` as a literal entry, then `script` is passed to `require`.
-- @param script file name of the module, e.g. "puremagic.lua"
-- @return whatever `require(script)` returns
function import(script)
  -- The match yields nil when the script path has no directory component
  -- (e.g. "filter.lua"); fall back to an empty prefix instead of raising
  -- "attempt to concatenate a nil value".
  local dir = PANDOC_SCRIPT_FILE:match("(.*[/\\])") or ""
  package.path = dir .. script .. ";" .. package.path
  return require(script)
end
|
52 |
+
|
53 |
+
local puremagic = import("puremagic.lua")
|
54 |
+
|
55 |
+
return {
  --- `downloadthis` shortcode handler.
  -- Embeds the file named by the first positional argument into the page as a
  -- base64 data URI and renders a button that downloads it on click.
  --
  -- Keyword arguments (all optional): `dname` (download name without
  -- extension, default "file"), `label` (default "Download"), `type` (button
  -- variant appended to "btn-", default "default"), `icon` (appended to
  -- "bi bi-", default "download"), `class` (extra CSS classes) and `id`
  -- (default: "dnldts" plus a random number).
  --
  -- Returns a raw HTML inline when the output is "html:js" with Bootstrap;
  -- returns `pandoc.Null()` otherwise, and also when the file cannot be opened.
  ['downloadthis'] = function(args, kwargs, meta)

    -- args and kwargs
    local file_path = str(args[1])

    -- Take the extension from the last path component only. Matching
    -- "[^.]+$" against the whole path picked up directory parts for files
    -- without an extension (e.g. "../assets/README") and raised on a
    -- trailing dot.
    local file_name = file_path:match("[^/\\]+$") or file_path
    local ext = file_name:match("%.([^.]+)$")
    local extension = ext and ("." .. ext) or ""

    local dname = optional(str(kwargs["dname"]), "file")
    local dfilename = dname .. extension
    local btn_label = " " .. optional(str(kwargs["label"]), "Download") .. " "
    local btn_type = optional(str(kwargs["type"]), "default")
    local icon = optional(str(kwargs["icon"]), "download")
    local class = " " .. optional(str(kwargs["class"]), "")
    local rand = "dnldts" .. str(math.random(1, 65000))
    local id = optional(str(kwargs["id"]), rand)

    -- reading files
    local fh = io.open(file_path, "rb")
    if not fh then
      io.stderr:write("Cannot open file " ..
        file_path ..
        " | Skipping adding buttons\n")
      return pandoc.Null()
    end

    -- Nothing is rendered for other formats, so skip reading and encoding
    -- the file in that case.
    if not (quarto.doc.is_format("html:js") and quarto.doc.has_bootstrap()) then
      fh:close()
      return pandoc.Null()
    end

    local contents = fh:read("*all")
    fh:close()

    -- creating dataURI object
    local b64_encoded = quarto.base64.encode(contents)
    -- NOTE(review): via_path is defined outside this view; the fallback only
    -- guards against a nil result so the concatenation below cannot raise.
    local mimetype = puremagic.via_path(file_path) or "application/octet-stream"
    local data_uri = 'data:' .. mimetype .. ";base64," .. b64_encoded

    -- js code taken from
    -- https://github.com/fmmattioni/downloadthis/blob/master/R/utils.R#L59
    local js = [[fetch('%s').then(res => res.blob()).then(blob => {
const downloadURL = window.URL.createObjectURL(blob);
const a = document.createElement('a');
document.body.appendChild(a);
a.href = downloadURL;
a.download = '%s'; a.click();
window.URL.revokeObjectURL(downloadURL);
document.body.removeChild(a);
});]]

    local clicked = js:format(data_uri, dfilename)

    -- creating button
    local button =
      "<button class=\"btn btn-" .. btn_type .. " downloadthis " ..
      class .. "\"" ..
      " id=\"" .. id .. "\"" ..
      "><i class=\"bi bi-" .. icon .. "\"" .. "></i>" ..
      btn_label ..
      "</button>"

    ensureHtmlDeps()
    return pandoc.RawInline('html',
      "<a href=\"#" .. id .. "\"" ..
      " onclick=\"" .. clicked .. "\">" .. button .. "</a>"
    )
  end
}
|
120 |
+
|
121 |
+
|
src/_extensions/shafayetShafee/downloadthis/puremagic.lua
ADDED
@@ -0,0 +1,735 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
-- puremagic 1.0.1
|
2 |
+
-- Copyright (c) 2014 Will Bond <[email protected]>
|
3 |
+
-- Licensed under the MIT license.
|
4 |
+
|
5 |
+
|
6 |
+
--- Return the last component of `path`, splitting on "/" or "\".
-- When the path has no separator followed by a name, the path itself is
-- returned. The second return value is always nil.
function basename(path)
  local tail = path:match('[/\\]([^/\\]+)$')
  if not tail then
    tail = path
  end
  return tail, nil
end
|
14 |
+
|
15 |
+
|
16 |
+
--- Return the lower-cased extension of `path` without the leading dot.
-- Compound forms are kept together: "tar.<x>", "numbers.zip", "pages.zip"
-- and "key.zip". Returns nil when the path contains no dot-separated suffix.
function extension(path)
  local lowered = path:lower()

  local tarball = lowered:match('%.(tar%.[^.]+)$')
  if tarball then
    return tarball
  end

  -- iWork bundles renamed to *.zip; checked in the same order as before
  local compound = {'numbers.zip', 'pages.zip', 'key.zip'}
  for index = 1, #compound do
    local suffix = compound[index]
    if lowered:sub(-(#suffix + 1)) == '.' .. suffix then
      return suffix
    end
  end

  return lowered:match('%.([^.]+)$')
end
|
33 |
+
|
34 |
+
|
35 |
+
--- Report whether `value` equals (==) any element of the sequence `list`.
-- @return true when found, false otherwise
function in_table(value, list)
  local count = #list
  local index = 1
  while index <= count do
    if list[index] == value then
      return true
    end
    index = index + 1
  end
  return false
end
|
43 |
+
|
44 |
+
|
45 |
+
--- Convert a byte string into a list of bit tables.
-- Each byte becomes a table of eight entries, most significant bit first,
-- e.g. "A" -> { {0, 1, 0, 0, 0, 0, 0, 1} }.
--
-- The bits are computed with `%` and `math.floor` so they stay integers on
-- Lua 5.3+. With `/` the intermediate value became a float, so the tables
-- held 0.0/1.0 and `table.concat` on them produced strings such as
-- "01.00.0...", which `tonumber(s, 2)` cannot parse.
-- @param chars byte string
-- @return sequence with one 8-entry bit table per byte of `chars`
function string_to_bit_table(chars)
  local output = {}
  for index = 1, #chars do
    local num = chars:byte(index)
    local bits = {0, 0, 0, 0, 0, 0, 0, 0}
    for bit = 8, 1, -1 do
      if num == 0 then
        break
      end
      bits[bit] = num % 2
      num = math.floor(num / 2)
    end
    output[index] = bits
  end
  return output
end
|
60 |
+
|
61 |
+
|
62 |
+
--- Turn a list of bit tables back into a byte string.
-- Each entry of `bits` is concatenated, parsed as a base-2 number and
-- emitted as the character with that code.
function bit_table_to_string(bits)
  local count = #bits
  local pieces = {}
  local index = 1
  while index <= count do
    local code = tonumber(table.concat(bits[index]), 2)
    pieces[index] = string.format('%c', code)
    index = index + 1
  end
  return table.concat(pieces)
end
|
70 |
+
|
71 |
+
|
72 |
+
--- Byte-wise AND of two byte strings.
-- The result has one byte per byte of `a`; `b` supplies the mask bits at the
-- same positions.
function bitwise_and(a, b)
  local left = string_to_bit_table(a)
  local right = string_to_bit_table(b)

  local masked = {}
  local byte_index = 1
  while byte_index <= #left do
    local bits = {0, 0, 0, 0, 0, 0, 0, 0}
    for bit_index = 1, 8 do
      -- a bit survives only when it is set on both sides
      local both_set = left[byte_index][bit_index] == 1 and right[byte_index][bit_index] == 1
      bits[bit_index] = both_set and 1 or 0
    end
    masked[byte_index] = bits
    byte_index = byte_index + 1
  end

  return bit_table_to_string(masked)
end
|
91 |
+
|
92 |
+
|
93 |
+
-- Unpack a little endian byte string into an integer.
-- The first byte is the least significant one. Returns nil for an empty
-- string.
--
-- The value is built with byte arithmetic rather than by joining bit strings
-- and calling tonumber(s, 2): on Lua 5.3+ the bit tables from
-- string_to_bit_table hold floats, so the joined string was unparsable and
-- the result was nil.
function unpack_le(chars)
  if #chars == 0 then
    return nil
  end
  local value = 0
  for index = #chars, 1, -1 do
    value = value * 256 + chars:byte(index)
  end
  return value
end
|
103 |
+
|
104 |
+
|
105 |
+
-- Unpack a big endian byte string into an integer.
-- The first byte is the most significant one. Returns nil for an empty
-- string.
--
-- The value is built with byte arithmetic rather than by joining bit strings
-- and calling tonumber(s, 2): on Lua 5.3+ the bit tables from
-- string_to_bit_table hold floats, so the joined string was unparsable and
-- the result was nil.
function unpack_be(chars)
  if #chars == 0 then
    return nil
  end
  local value = 0
  for index = 1, #chars do
    value = value * 256 + chars:byte(index)
  end
  return value
end
|
114 |
+
|
115 |
+
|
116 |
+
-- Takes the first 4-8k of an EBML file and identifies whether it is matroska
-- or webm, and whether it contains video or just audio.
--
-- Returns the MIME type string ('video/...' or 'audio/...'), or nil plus an
-- error message when the EBML header is missing or the doctype is unknown.
-- Seek and track entries that lack the expected children are skipped instead
-- of raising on a nil value.
function ebml_parse(content)
  local position = 1
  local length = #content

  local header_token, header_value, used_bytes = ebml_parse_section(content)
  position = position + used_bytes

  if header_token ~= '\x1AE\xDF\xA3' then
    return nil, 'Unable to find EBML ID'
  end

  -- The matroska spec sets the default doctype to be 'matroska', however
  -- many files specify this anyway. The other option is 'webm'.
  local doctype = 'matroska'
  if type(header_value) == 'table' and header_value['B\x82'] then
    doctype = header_value['B\x82']
  end

  if doctype ~= 'matroska' and doctype ~= 'webm' then
    return nil, 'Unknown EBML doctype'
  end

  local segment_position = nil
  local track_position = nil
  local has_video = false
  local found_tracks = false

  while position <= length do
    local ebml_id, ebml_value, used_bytes = ebml_parse_section(content:sub(position, length))
    position = position + used_bytes

    -- Segment
    if ebml_id == '\x18S\x80g' then
      segment_position = position
    end

    -- Meta seek information. Seek offsets are added to the segment position,
    -- so they are only usable once a segment has been seen.
    if ebml_id == '\x11M\x9Bt' and segment_position and type(ebml_value) == 'table' then
      -- Look for the seek info about the tracks token
      for _, child in ipairs(ebml_value['M\xBB'] or {}) do
        if type(child) == 'table' and child['S\xAB'] == '\x16T\xAEk' then
          local seek_position = child['S\xAC']
          local offset = type(seek_position) == 'string' and unpack_be(seek_position) or nil
          -- Only jump forward: a target behind the current position would
          -- re-read this seek head and loop forever.
          if offset and segment_position + offset >= position then
            track_position = segment_position + offset
            position = track_position
          end
          break
        end
      end
    end

    -- Track
    if ebml_id == '\x16T\xAEk' then
      found_tracks = true
      if type(ebml_value) == 'table' then
        -- Scan through each track looking for video
        for _, child in ipairs(ebml_value['\xAE'] or {}) do
          local track_type = type(child) == 'table' and child['\x83'] or nil
          -- Look to see if the track type is video
          if type(track_type) == 'string' and unpack_be(track_type) == 1 then
            has_video = true
            break
          end
        end
      end
      break
    end
  end

  if found_tracks and not has_video then
    if doctype == 'matroska' then
      return 'audio/x-matroska'
    else
      return 'audio/webm'
    end
  end

  if doctype == 'matroska' then
    return 'video/x-matroska'
  else
    return 'video/webm'
  end
end
|
196 |
+
|
197 |
+
|
198 |
+
-- Parses one EBML element from the start of `content`.
-- Returns three values: the element's EBML ID, its value, and the number of
-- bytes consumed by ID, length field and value. The value is the raw byte
-- string, except for 4-byte IDs, seek entries and track entries, whose
-- content is parsed into a table keyed by child EBML ID. For the segment
-- element the value is nil and only the header bytes are counted.
function ebml_parse_section(content)
  local ebml_id, element_length, used_bytes = ebml_id_and_length(content)

  -- The segment spans the whole file, so its body is not consumed here
  if ebml_id == '\x18\x53\x80\x67' then
    return ebml_id, nil, used_bytes
  end

  local payload = content:sub(used_bytes + 1, used_bytes + element_length)
  used_bytes = used_bytes + element_length

  -- 4-byte IDs (level 0/1 elements), Seek entries and TrackEntry elements
  -- are the only ones whose children are needed
  local descend = #ebml_id == 4 or ebml_id == '\x4D\xBB' or ebml_id == '\xAE'
  if not descend then
    return ebml_id, payload, used_bytes
  end

  local children = {}
  -- IDs whose entry in `children` is already a list
  local is_list = {}

  while #payload > 0 do
    local child_id, child_value, child_used = ebml_parse_section(payload)

    if is_list[child_id] then
      table.insert(children[child_id], child_value)
    elseif children[child_id] ~= nil then
      -- Second occurrence of an ID: promote the stored value to a list
      children[child_id] = {children[child_id], child_value}
      is_list[child_id] = true
    else
      -- Seek info and tracks are always lists, even with a single entry
      if child_id == 'M\xBB' or child_id == '\xAE' then
        child_value = {child_value}
        is_list[child_id] = true
      end
      children[child_id] = child_value
    end

    -- Drop the bytes that were just parsed
    payload = payload:sub(child_used + 1, #payload)
  end

  return ebml_id, children, used_bytes
end
|
261 |
+
|
262 |
+
|
263 |
+
-- Should accept 12+ bytes. Returns the EBML ID, the data length, and the
-- number of bytes that held those two fields.
function ebml_id_and_length(chars)
  -- The ID uses the same variable-length encoding as the length field, but
  -- it is kept in its encoded form (marker bits included), so only the byte
  -- count from ebml_length is used for it.
  local id_length = select(2, ebml_length(chars:sub(1, 4)))

  local size_field = chars:sub(id_length + 1, id_length + 8)
  local element_length, size_bytes = ebml_length(size_field)

  return chars:sub(1, id_length), element_length, id_length + size_bytes
end
|
278 |
+
|
279 |
+
|
280 |
+
-- Should accept 8+ bytes. Returns the decoded data length plus the number of
-- bytes that were used to hold it.
function ebml_length(chars)
  -- Only the first 8 bytes are converted, to keep the bit table small
  local byte_bits = string_to_bit_table(chars:sub(1, 8))
  local lead = byte_bits[1]

  -- One byte, plus one more for every zero bit before the first set bit of
  -- the leading byte. That set bit is cleared so it does not count towards
  -- the value.
  local value_length = 1
  for position = 1, #lead do
    if lead[position] ~= 0 then
      lead[position] = 0
      break
    end
    value_length = value_length + 1
  end

  local joined = {}
  for index = 1, value_length do
    joined[index] = table.concat(byte_bits[index])
  end

  return tonumber(table.concat(joined), 2), value_length
end
|
304 |
+
|
305 |
+
|
306 |
+
--- Identify a MIME type from the leading bytes of a binary file.
-- Tests run in a fixed order (images, audio/video, documents, office
-- formats, archives, executables) and the first match wins.
-- @param content file content (or its leading part) as a byte string
-- @param ext lower-cased extension; used to tell apart formats that share a
--   container signature (OLE compound files, zip-based documents)
-- @return MIME type string, or nil when no signature matches
function binary_tests(content, ext)
  local length = #content
  local _1_8 = content:sub(1, 8)
  local _1_7 = content:sub(1, 7)
  local _1_6 = content:sub(1, 6)
  local _1_5 = content:sub(1, 5)
  local _1_4 = content:sub(1, 4)
  local _1_3 = content:sub(1, 3)
  local _1_2 = content:sub(1, 2)
  local _9_12 = content:sub(9, 12)


  -- Images
  if _1_4 == '\xC5\xD0\xD3\xC6' then
    -- With a Windows-format EPS, the file starts right after a 30-byte
    -- header, or a 30-byte header followed by two bytes of padding
    if content:sub(33, 42) == '%!PS-Adobe' or content:sub(31, 40) == '%!PS-Adobe' then
      return 'application/postscript'
    end
  end

  if _1_8 == '%!PS-Ado' and content:sub(9, 10) == 'be' then
    return 'application/postscript'
  end

  if _1_4 == 'MM\x00*' or _1_4 == 'II*\x00' then
    return 'image/tiff'
  end

  if _1_8 == '\x89PNG\r\n\x1A\n' then
    return 'image/png'
  end

  if _1_6 == 'GIF87a' or _1_6 == 'GIF89a' then
    return 'image/gif'
  end

  if _1_4 == 'RIFF' and _9_12 == 'WEBP' then
    return 'image/webp'
  end

  if _1_2 == 'BM' and length > 14 and in_table(content:sub(15, 15), {'\x0C', '(', '@', '\x80'}) then
    return 'image/x-ms-bmp'
  end

  local normal_jpeg = length > 10 and in_table(content:sub(7, 10), {'JFIF', 'Exif'})
  local photoshop_jpeg = length > 24 and _1_4 == '\xFF\xD8\xFF\xED' and content:sub(21, 24) == '8BIM'
  if normal_jpeg or photoshop_jpeg then
    return 'image/jpeg'
  end

  if _1_4 == '8BPS' then
    return 'image/vnd.adobe.photoshop'
  end

  -- The JPEG 2000 signature box type is "jP" followed by two spaces. The
  -- spaces are written as \x20 so the literal is clearly 8 bytes long; a
  -- 7-byte literal can never equal the 8-byte slice.
  if _1_8 == '\x00\x00\x00\x0CjP\x20\x20' and _9_12 == '\r\n\x87\n' then
    return 'image/jp2'
  end

  if _1_4 == '\x00\x00\x01\x00' then
    return 'application/vnd.microsoft.icon'
  end


  -- Audio/Video
  if _1_4 == '\x1AE\xDF\xA3' and length > 1000 then
    -- Sniffing is best-effort: a truncated or malformed EBML stream must not
    -- abort detection, so parse errors are treated as "no match".
    local ok, mimetype = pcall(ebml_parse, content)

    if ok and mimetype then
      return mimetype
    end
  end

  if _1_4 == 'MOVI' then
    if in_table(content:sub(5, 8), {'moov', 'mdat'}) then
      return 'video/quicktime'
    end
  end

  if length > 8 and content:sub(5, 8) == 'ftyp' then
    local lower_9_12 = _9_12:lower()

    if in_table(lower_9_12, {'avc1', 'isom', 'iso2', 'mp41', 'mp42', 'mmp4', 'ndsc', 'ndsh', 'ndsm', 'ndsp', 'ndss', 'ndxc', 'ndxh', 'ndxm', 'ndxp', 'ndxs', 'f4v ', 'f4p ', 'm4v '}) then
      return 'video/mp4'
    end

    if in_table(lower_9_12, {'msnv', 'ndas', 'f4a ', 'f4b ', 'm4a ', 'm4b ', 'm4p '}) then
      return 'audio/mp4'
    end

    if in_table(lower_9_12, {'3g2a', '3g2b', '3g2c', 'kddi'}) then
      return 'video/3gpp2'
    end

    if in_table(lower_9_12, {'3ge6', '3ge7', '3gg6', '3gp1', '3gp2', '3gp3', '3gp4', '3gp5', '3gp6', '3gs7'}) then
      return 'video/3gpp'
    end

    -- Brands are 4 bytes; the QuickTime brand is "qt" padded with two spaces
    if lower_9_12 == 'mqt ' or lower_9_12 == 'qt\x20\x20' then
      return 'video/quicktime'
    end

    if lower_9_12 == 'jp2 ' then
      return 'image/jp2'
    end
  end

  -- MP3
  if bitwise_and(_1_2, '\xFF\xF6') == '\xFF\xF2' then
    local byte_3 = content:sub(3, 3)
    if bitwise_and(byte_3, '\xF0') ~= '\xF0' and bitwise_and(byte_3, "\x0C") ~= "\x0C" then
      return 'audio/mpeg'
    end
  end
  if _1_3 == 'ID3' then
    return 'audio/mpeg'
  end

  if _1_4 == 'fLaC' then
    return 'audio/x-flac'
  end

  if _1_8 == '0&\xB2u\x8Ef\xCF\x11' then
    -- Without writing a full-on ASF parser, we can just scan for the
    -- UTF-16 string "AspectRatio"
    if content:find('\x00A\x00s\x00p\x00e\x00c\x00t\x00R\x00a\x00t\x00i\x00o', 1, true) then
      return 'video/x-ms-wmv'
    end
    return 'audio/x-ms-wma'
  end

  if _1_4 == 'RIFF' and _9_12 == 'AVI ' then
    return 'video/x-msvideo'
  end

  if _1_4 == 'RIFF' and _9_12 == 'WAVE' then
    return 'audio/x-wav'
  end

  if _1_4 == 'FORM' and _9_12 == 'AIFF' then
    return 'audio/x-aiff'
  end

  if _1_4 == 'OggS' then
    local _29_33 = content:sub(29, 33)
    if _29_33 == '\x01vorb' then
      return 'audio/vorbis'
    end
    if _29_33 == '\x07FLAC' then
      return 'audio/x-flac'
    end
    if _29_33 == 'OpusH' then
      return 'audio/ogg'
    end
    -- Theora and OGM. The slice is 5 bytes, so the OGM marker needs its
    -- leading packet-type byte to be able to match.
    -- NOTE(review): assumes OGM video headers start with 0x01 "video" - confirm.
    if _29_33 == '\x80theo' or _29_33 == '\x01vide' then
      return 'video/ogg'
    end
  end

  if _1_3 == 'FWS' or _1_3 == 'CWS' then
    return 'application/x-shockwave-flash'
  end

  if _1_3 == 'FLV' then
    return 'video/x-flv'
  end


  if _1_5 == '%PDF-' then
    return 'application/pdf'
  end

  if _1_5 == '{\\rtf' then
    return 'text/rtf'
  end


  -- Office '97-2003 formats
  if _1_8 == '\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1' then
    if in_table(ext, {'xls', 'csv', 'tab'}) then
      return 'application/vnd.ms-excel'
    end
    if ext == 'ppt' then
      return 'application/vnd.ms-powerpoint'
    end
    -- We default to word since we need something if the extension isn't recognized
    return 'application/msword'
  end

  if _1_8 == '\x09\x04\x06\x00\x00\x00\x10\x00' then
    return 'application/vnd.ms-excel'
  end

  if _1_6 == '\xDB\xA5\x2D\x00\x00\x00' or _1_5 == '\x50\x4F\x5E\x51\x60' or _1_4 == '\xFE\x37\x00\x23' or _1_3 == '\x94\xA6\x2E' then
    return 'application/msword'
  end

  if _1_4 == 'PK\x03\x04' then
    -- Office XML formats
    if ext == 'xlsx' then
      return 'application/vnd.ms-excel'
    end

    if ext == 'pptx' then
      return 'application/vnd.ms-powerpoint'
    end

    if ext == 'docx' then
      return 'application/msword'
    end

    -- Open Office formats
    if ext == 'ods' then
      return 'application/vnd.oasis.opendocument.spreadsheet'
    end

    if ext == 'odp' then
      return 'application/vnd.oasis.opendocument.presentation'
    end

    if ext == 'odt' then
      return 'application/vnd.oasis.opendocument.text'
    end

    -- iWork - some programs like Mac Mail change the filename to
    -- .numbers.zip, etc
    if ext == 'pages' or ext == 'pages.zip' then
      return 'application/vnd.apple.pages'
    end
    if ext == 'key' or ext == 'key.zip' then
      return 'application/vnd.apple.keynote'
    end
    if ext == 'numbers' or ext == 'numbers.zip' then
      return 'application/vnd.apple.numbers'
    end

    -- Otherwise just a zip
    return 'application/zip'
  end


  -- Archives
  if length > 257 then
    if content:sub(258, 263) == 'ustar\x00' then
      return 'application/x-tar'
    end
    if content:sub(258, 265) == 'ustar\x40\x40\x00' then
      return 'application/x-tar'
    end
  end

  if _1_7 == 'Rar!\x1A\x07\x00' or _1_8 == 'Rar!\x1A\x07\x01\x00' then
    return 'application/x-rar-compressed'
  end

  if _1_2 == '\x1F\x9D' then
    return 'application/x-compress'
  end

  if _1_2 == '\x1F\x8B' then
    return 'application/x-gzip'
  end

  if _1_3 == 'BZh' then
    return 'application/x-bzip2'
  end

  if _1_6 == '\xFD7zXZ\x00' then
    return 'application/x-xz'
  end

  if _1_6 == '7z\xBC\xAF\x27\x1C' then
    return 'application/x-7z-compressed'
  end

  if _1_2 == 'MZ' then
    -- Bytes 61-64 hold the little-endian offset of the PE header. A file
    -- shorter than that yields nil, which must not reach the arithmetic below.
    local pe_header_start = unpack_le(content:sub(61, 64))

    if pe_header_start then
      local signature = content:sub(pe_header_start + 1, pe_header_start + 4)

      if signature == 'PE\x00\x00' then
        local image_file_header_start = pe_header_start + 5
        local characteristics = content:sub(image_file_header_start + 18, image_file_header_start + 19)
        local is_dll = bitwise_and(characteristics, '\x20\x00') == '\x20\x00'

        if is_dll then
          return 'application/x-msdownload'
        end

        return 'application/octet-stream'
      end
    end
  end

  return nil
end
|
601 |
+
|
602 |
+
|
603 |
+
-- Shebang patterns (matched against the lower-cased content) paired with the
-- MIME type reported when they match. Checked in this order; first hit wins.
local shebang_mimetypes = {
    { '^#![/a-z0-9]+ ?python', 'application/x-python' },
    { '^#![/a-z0-9]+ ?perl',   'application/x-perl' },
    { '^#![/a-z0-9]+ ?ruby',   'application/x-ruby' },
    { '^#![/a-z0-9]+ ?php',    'application/x-httpd-php' },
    { '^#![/a-z0-9]+ ?bash',   'text/x-shellscript' }
}

-- Guess the MIME type of textual content from well-known leading tokens.
--
-- content: string holding (the beginning of) the file.
-- Returns a MIME type string, or nil when no token is recognised.
function text_tests(content)
    local lowered = content:lower()

    -- PostScript header; '%%' is a literal percent sign in a Lua pattern.
    if content:find('^%%!PS-Adobe') then
        return 'application/postscript'
    end

    -- PHP open tags may appear anywhere in the content (plain-text search).
    local has_php_tag = lowered:find('<?php', 1, true) or content:find('<?=', 1, true)
    if has_php_tag then
        return 'application/x-httpd-php'
    end

    -- XML declaration (optionally preceded by whitespace): refine by the
    -- first recognised marker, falling back to generic XML.
    if lowered:find('^%s*<%?xml') then
        if content:find('<svg') then
            return 'image/svg+xml'
        elseif lowered:find('<!doctype html') then
            return 'application/xhtml+xml'
        elseif content:find('<rss') then
            return 'application/rss+xml'
        end
        return 'application/xml'
    end

    -- HTML without an XML declaration.
    if lowered:find('^%s*<html') or lowered:find('^%s*<!doctype') then
        return 'text/html'
    end

    -- Script interpreters announced via a shebang line.
    for _, entry in ipairs(shebang_mimetypes) do
        if lowered:find(entry[1]) then
            return entry[2]
        end
    end

    return nil
end
|
653 |
+
|
654 |
+
|
655 |
+
-- File extension -> MIME type, used as the last-resort detection step.
local ext_map = {
    -- text/*
    css   = 'text/css',
    csv   = 'text/csv',
    htm   = 'text/html',
    html  = 'text/html',
    xhtml = 'text/html',
    ics   = 'text/calendar',
    sh    = 'text/x-shellscript',
    tab   = 'text/tab-separated-values',
    vcf   = 'text/x-vcard',

    -- PHP
    php   = 'application/x-httpd-php',
    php3  = 'application/x-httpd-php',
    php4  = 'application/x-httpd-php',
    php5  = 'application/x-httpd-php',
    inc   = 'application/x-httpd-php',

    -- Other scripting languages
    js    = 'application/javascript',
    pl    = 'application/x-perl',
    cgi   = 'application/x-perl',
    py    = 'application/x-python',
    rb    = 'application/x-ruby',
    rhtml = 'application/x-ruby',

    -- XML dialects
    rss   = 'application/rss+xml',
    xml   = 'application/xml'
}

-- Map a file extension to a MIME type.
--
-- ext: extension string as it appears as a key of ext_map (e.g. 'css').
-- Returns the mapped MIME type, or 'text/plain' when the extension is not
-- listed.
function ext_tests(ext)
    return ext_map[ext] or 'text/plain'
end
|
687 |
+
|
688 |
+
|
689 |
+
-- Public module table.
local _M = {}


-- Detect the MIME type of a file on disk.
--
-- Reads at most the first 4096 bytes of the file and hands them to
-- _M.via_content together with the file name.
--
-- path:     path of the file to inspect.
-- filename: optional name passed on to via_content; when omitted,
--           basename(path) is used.
-- Returns whatever _M.via_content returns, or nil plus the io.open error
-- message when the file cannot be opened.
function _M.via_path(path, filename)
    -- Open in binary mode: the bytes are compared against binary signatures,
    -- and on systems that distinguish text from binary mode a text-mode read
    -- may not return them unchanged.
    local f, err = io.open(path, 'rb')
    if not f then
        return nil, err
    end

    -- file:read(n) returns nil at end of file, which is what an empty file
    -- yields; fall back to an empty string so the content checks receive a
    -- string instead of nil.
    local content = f:read(4096) or ''
    f:close()

    if not filename then
        filename = basename(path)
    end

    return _M.via_content(content, filename)
end
|
707 |
+
|
708 |
+
|
709 |
+
-- Detect the MIME type of a chunk of content.
--
-- content:  string holding (the beginning of) the data.
-- filename: name handed to extension() to obtain the extension used by the
--           binary checks and by the final extension-based fallback.
-- Returns a MIME type string.
--
-- Detection order: binary signatures, then a control-character check, then
-- textual tokens, and finally the file extension.
function _M.via_content(content, filename)
    local ext = extension(filename)

    local detected = binary_tests(content, ext)
    if detected then
        return detected
    end

    -- Binary-looking files should have been detected so far: anything that
    -- still contains NUL or a control character other than tab, LF or CR is
    -- reported as opaque binary data.
    if content:find('[%z\x01-\x08\x0B\x0C\x0E-\x1F]') then
        return 'application/octet-stream'
    end

    -- No control characters and possibly no distinguishable tokens either:
    -- try the text tokens, otherwise decide by file extension.
    return text_tests(content) or ext_tests(ext)
end


return _M
|
src/_extensions/shafayetShafee/downloadthis/resources/css/downloadthis.css
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* No box-shadow while the button is pressed or focused
   (!important so this wins over other declarations). */
.downloadthis:active,
.downloadthis:focus {
  box-shadow: none !important;
}

/* Hover: darken slightly; the 0.2s transition is declared on :hover only. */
.downloadthis:hover {
  filter: brightness(0.90);
  transition: 0.2s;
}

/* Pressed: darken further. Declared after :hover so it takes precedence
   at equal specificity when both states apply. */
.downloadthis:active {
  filter: brightness(0.80);
}
|
src/assets/App_Market_Scraping.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
src/assets/buergergeld_forum.zip
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ae6e5db4a58dcfa7343cbcfed582e1e12d635e747834f283db2c8ee6099a8d69
|
3 |
+
size 1695491
|