Commit abbf29d
Duplicate from keturn/INED-datasette
Co-authored-by: Kevin Turner <[email protected]>
- .gitattributes +34 -0
- Dockerfile +31 -0
- README.md +11 -0
- metadata.json +14 -0
- settings.json +3 -0
- src/import-git.sh +16 -0
- src/textdir2sql/loading.py +90 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,31 @@
+FROM datasetteproject/datasette:0.64.1
+
+# huggingface spaces run as user 1000
+RUN adduser hf-space --uid 1000 --disabled-password --gecos '' && \
+    mkdir /home/hf-space/app && \
+    chown hf-space: /home/hf-space/app
+WORKDIR /home/hf-space/app
+
+RUN datasette install datasette-configure-fts && \
+    datasette install datasette-render-image-tags
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends git && \
+    apt-get clean && \
+    rm -rf /var/lib/apt && \
+    rm -rf /var/lib/dpkg/info/*
+
+USER hf-space
+
+# spaces default port
+EXPOSE 7860
+ENTRYPOINT ["datasette", "--host=0.0.0.0", "--port=7860"]
+CMD ["."]
+
+ENV PYTHONPATH=/home/hf-space/app/src/
+
+COPY src src
+COPY metadata.json settings.json ./
+
+RUN src/import-git.sh && \
+    datasette inspect *.db --inspect-file=inspect-data.json
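
Because the dataset import and `datasette inspect` run at build time, the resulting image should also work outside the Space. A minimal local sketch, assuming Docker is installed; "ined-datasette" is just a placeholder tag, not a name used by this repo:

# build the image (this clones the dataset and writes INE.db during the build)
docker build -t ined-datasette .
# run it on the Space's default port; Datasette should then be reachable at http://localhost:7860/
docker run --rm -p 7860:7860 ined-datasette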
README.md
ADDED
@@ -0,0 +1,11 @@
+---
+title: INED Datasette
+emoji: 🐢
+colorFrom: indigo
+colorTo: pink
+sdk: docker
+pinned: false
+duplicated_from: keturn/INED-datasette
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
metadata.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "title": "Imaginary Network Expanded Dataset",
+    "description": "Curated by Sygil",
+    "source_url": "https://github.com/Sygil-Dev/INE-dataset",
+    "databases": {
+        "INE": {
+            "tables": {
+                "images": {
+                    "fts_table": "captions_fts"
+                }
+            }
+        }
+    }
+}
settings.json
ADDED
@@ -0,0 +1,3 @@
+{
+    "default_page_size": 20
+}
src/import-git.sh
ADDED
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -x -e -o pipefail
+
+REPO="https://github.com/Sygil-Dev/INE-dataset.git"
+IMAGE_HOST="https://raw.githubusercontent.com/Sygil-Dev/INE-dataset/main/data/"
+
+# avoid cloning all the image files
+git clone --no-checkout --filter=blob:none --depth 1 "${REPO}" dataset
+
+# Beware `--no-cone` is deprecated, so this may stop working someday
+# https://git-scm.com/docs/git-sparse-checkout#_internalsnon_cone_problems
+git -C dataset sparse-checkout set --no-cone '/data/*.txt'
+git -C dataset checkout main
+
+python3 -m textdir2sql.loading dataset/data INE.db \
+    --image-host="${IMAGE_HOST}"
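
A quick way to sanity-check the clone step locally (not part of the build) is to list what the sparse, blob-filtered checkout actually materialized; only the /data/*.txt caption files should be present. A sketch using standard git and coreutils:

# show the active sparse-checkout patterns
git -C dataset sparse-checkout list
# count the caption files that were checked out, and confirm the clone stayed small
find dataset/data -name '*.txt' | wc -l
du -sh dataset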
src/textdir2sql/loading.py
ADDED
@@ -0,0 +1,90 @@
+import sqlite3
+from functools import partial
+from itertools import islice
+from pathlib import Path
+
+import click
+
+BATCH_SIZE = 1024
+
+
+@click.command()
+@click.argument('input_dir', type=click.Path(exists=True, file_okay=False, path_type=Path))
+@click.argument('output', type=click.Path(dir_okay=False, writable=True, path_type=Path))
+@click.option('--image-host', help="base URL of images")
+@click.option('--explicit/--no-explicit', default=False)
+def main(input_dir: Path, output: Path, image_host: str, explicit: bool):
+    connection = sqlite3.connect(output)
+    try:
+        _main_with_connection(input_dir, connection, image_host, explicit)
+    finally:
+        connection.close()
+
+
+def _main_with_connection(input_dir: Path, connection: sqlite3.Connection, image_host: str = None, explicit=False):
+    connection.execute("CREATE TABLE IF NOT EXISTS "
+                       "  captions(image_key text PRIMARY KEY, caption text NOT NULL);")
+
+    if image_host:
+        connection.execute(f"""
+            CREATE VIEW IF NOT EXISTS images AS
+            SELECT {sql_quote(connection, image_host)} || image_key || '.jpg' AS image,
+                   caption,
+                   rowid
+            FROM captions
+        """)
+
+    text_files = input_dir.glob("*.txt")
+
+    with click.progressbar(chunked(text_files, BATCH_SIZE)) as progress:
+        for batch in progress:
+            text_file: Path
+            pairs = ((text_file.stem, text_file.read_text())
+                     for text_file in batch)
+            with connection:
+                connection.executemany("INSERT INTO captions(image_key, caption) "
+                                       "VALUES(?, ?) ", pairs)
+
+    if not explicit:
+        ratings = ["rating:unsafe", "rating:explicit", "rating:mature", "meta:nsfw",
+                   "subreddit:%nsfw"]
+        for rating in ratings:
+            with connection:
+                c = connection.execute("DELETE FROM captions WHERE caption LIKE ?",
+                                       (f"%{rating}%",))
+                print(f"Removed {c.rowcount} {rating} rows")
+
+    with connection:
+        # Add full-text search index
+        connection.execute("""CREATE VIRTUAL TABLE
+            captions_fts USING
+            fts5(caption, image_key UNINDEXED, content=captions)
+        """)
+        connection.execute("""
+            INSERT INTO "captions_fts" (rowid, image_key, caption)
+            SELECT rowid, image_key, caption
+            FROM captions
+        """)
+
+
+def chunked(iterable, n):
+    return iter(partial(take, n, iter(iterable)), [])
+
+
+def take(n, iterable):
+    return list(islice(iterable, n))
+
+
+def sql_quote(connection, value: str) -> str:
+    """
+    Apply SQLite string quoting to a value, including wrapping it in single quotes.
+
+    :param value: String to quote
+    """
+    # Normally we would use .execute(sql, [params]) for escaping, but
+    # occasionally that isn't available - most notably when we need
+    # to include a "... DEFAULT 'value'" in a column definition.
+    return connection.execute(
+        # Use SQLite itself to correctly escape this string:
+        "SELECT quote(:value)",
+        {"value": value},
+    ).fetchone()[0]
+
+
+if __name__ == "__main__":
+    main()
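
For completeness, a sketch of running the loader by hand and spot-checking the database it writes. The caption directory and image host below are placeholders, not values from this repo; the positional arguments and --image-host option match the click command above:

# load a directory of *.txt caption files into INE.db
python3 -m textdir2sql.loading ./captions INE.db --image-host="https://example.com/img/"

# count imported captions and try an FTS5 match against the captions_fts index
sqlite3 INE.db "SELECT count(*) FROM captions;"
sqlite3 INE.db "SELECT image_key FROM captions_fts WHERE captions_fts MATCH 'forest' LIMIT 5;"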