KevinHuSh committed on
Commit
cdd9565
·
1 Parent(s): 3fc700a

Finish adding thumbnails to video, image, and PDF files (#18)

Browse files
Cargo.toml CHANGED
@@ -32,4 +32,4 @@ regex = "1.10.2"
32
  name = "doc_gpt"
33
 
34
  [workspace]
35
- members = [".", "migration"]
 
32
  name = "doc_gpt"
33
 
34
  [workspace]
35
+ members = [".", "migration"]
migration/src/m20220101_000001_create_table.rs CHANGED
@@ -201,7 +201,7 @@ impl MigrationTrait for Migration {
201
  .col(ColumnDef::new(DocInfo::Location).string().not_null())
202
  .col(ColumnDef::new(DocInfo::Size).big_integer().not_null())
203
  .col(ColumnDef::new(DocInfo::Type).string().not_null())
204
- .col(ColumnDef::new(DocInfo::ThumbnailBase64).string().not_null())
205
  .comment("doc type|folder")
206
  .col(
207
  ColumnDef::new(DocInfo::CreatedAt)
@@ -274,28 +274,28 @@ impl MigrationTrait for Migration {
274
  .values_panic([
275
  (1).into(),
276
  "Video".into(),
277
- ".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)".into(),
278
  (1).into(),
279
  (1).into(),
280
  ])
281
  .values_panic([
282
  (1).into(),
283
  "Picture".into(),
284
- ".*\\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)".into(),
285
  (2).into(),
286
  (2).into(),
287
  ])
288
  .values_panic([
289
  (1).into(),
290
  "Music".into(),
291
- ".*\\.(WAV|FLAC|APE|ALAC|WavPack|WV|MP3|AAC|Ogg|Vorbis|Opus)".into(),
292
  (3).into(),
293
  (3).into(),
294
  ])
295
  .values_panic([
296
  (1).into(),
297
  "Document".into(),
298
- ".*\\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp)".into(),
299
  (3).into(),
300
  (3).into(),
301
  ])
 
201
  .col(ColumnDef::new(DocInfo::Location).string().not_null())
202
  .col(ColumnDef::new(DocInfo::Size).big_integer().not_null())
203
  .col(ColumnDef::new(DocInfo::Type).string().not_null())
204
+ .col(ColumnDef::new(DocInfo::ThumbnailBase64).string().default(""))
205
  .comment("doc type|folder")
206
  .col(
207
  ColumnDef::new(DocInfo::CreatedAt)
 
274
  .values_panic([
275
  (1).into(),
276
  "Video".into(),
277
+ ".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)".into(),
278
  (1).into(),
279
  (1).into(),
280
  ])
281
  .values_panic([
282
  (1).into(),
283
  "Picture".into(),
284
+ ".*\\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)".into(),
285
  (2).into(),
286
  (2).into(),
287
  ])
288
  .values_panic([
289
  (1).into(),
290
  "Music".into(),
291
+ ".*\\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)".into(),
292
  (3).into(),
293
  (3).into(),
294
  ])
295
  .values_panic([
296
  (1).into(),
297
  "Document".into(),
298
+ ".*\\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key)".into(),
299
  (3).into(),
300
  (3).into(),
301
  ])
python/svr/add_thumbnail2file.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys, datetime, random, re, cv2
2
+ from os.path import dirname, realpath
3
+ sys.path.append(dirname(realpath(__file__)) + "/../")
4
+ from util.db_conn import Postgres
5
+ from util.minio_conn import HuMinio
6
+ from util import findMaxDt
7
+ import base64
8
+ from io import BytesIO
9
+ import pandas as pd
10
+ from PIL import Image
11
+ import pdfplumber
12
+
13
+
14
# Shared clients used by every function in this script.
# NOTE(review): the string arguments are passed straight to the project's
# Postgres / HuMinio wrappers; their exact meaning is defined there — confirm.
PG = Postgres("infiniflow", "docgpt")
MINIO = HuMinio("infiniflow")
16
def set_thumbnail(did, base64):
    """Write a thumbnail payload to one ``doc_info`` row.

    Args:
        did: Id of the document row to update. It is coerced with ``int()``
            so nothing but an integer can reach the statement.
        base64: Text stored in ``thumbnail_base64``. The parameter name
            shadows the ``base64`` module inside this function only.

    Raises:
        ValueError / TypeError: if ``did`` cannot be converted to an integer.
    """
    # The statement is assembled as text and handed to PG.update, so both
    # values are neutralised here: the id must be an integer, and single
    # quotes in the payload are doubled (SQL string-literal escaping).
    did = int(did)
    payload = str(base64).replace("'", "''")
    sql = f"""
    update doc_info set thumbnail_base64='{payload}'
    where
    did={did}
    """
    PG.update(sql)
23
+
24
+
25
def collect(comm, mod, tm):
    """Fetch the next batch of documents that still have no thumbnail.

    Args:
        comm: Divisor used in ``MOD(did, comm)``.
        mod: Remainder a row's ``did`` must have to be selected.
        tm: Lower bound (inclusive) for ``updated_at``.

    Returns:
        Whatever ``PG.select`` returns for the query (at most 10 rows, oldest
        ``updated_at`` first), or an empty ``pd.DataFrame`` when it has no rows.
    """
    query = f"""
    select
    did, uid, doc_name, location, updated_at
    from doc_info
    where
    updated_at >= '{tm}'
    and MOD(did, {comm}) = {mod}
    and is_deleted=false
    and type <> 'folder'
    and thumbnail_base64=''
    order by updated_at asc
    limit 10
    """
    pending = PG.select(query)
    if len(pending) == 0:
        return pd.DataFrame()

    # Progress line: batch size and the newest timestamp in it (to the second).
    newest = str(pending["updated_at"].max())[:19]
    print("TOTAL:", len(pending), "To: ", newest)
    return pending
45
+
46
+
47
def _thumbnail_b64(img, size=128):
    """Shrink ``img`` in place and return it encoded as base64 text.

    The longer side is scaled to ``size`` pixels. JPEG is tried first and PNG
    second, each into a fresh buffer so a half-written JPEG can never end up
    in front of the PNG bytes. Returns ``"_"`` when neither format can be
    written, so the row is not stored with an empty thumbnail.
    """
    w, h = img.size
    scale = size / max(w, h)
    # Clamp to 1px so extreme aspect ratios never request a zero-sized side.
    img.thumbnail((max(1, int(w * scale)), max(1, int(h * scale))))
    for fmt in ("JPEG", "PNG"):
        buffered = BytesIO()
        try:
            img.save(buffered, format=fmt)
        except Exception:
            continue
        return base64.b64encode(buffered.getvalue()).decode("utf-8")
    return "_"


def build(row):
    """Create and store the thumbnail for one ``doc_info`` row.

    Args:
        row: Mapping with ``did``, ``uid``, ``doc_name`` and ``location``.

    Behaviour by file extension (case-insensitive, surrounding blanks ignored):
        * pdf: first page rendered through pdfplumber.
        * images: opened with PIL.
        * videos: one frame read through a short-lived presigned URL.
        * anything else: the marker ``"_"`` is stored.
    """
    name = row["doc_name"].lower().strip()
    did = row["did"]
    bucket = "%s-upload" % str(row["uid"])

    if re.search(r"\.pdf$", name):
        # Only PDFs and images need the object bytes; videos are streamed.
        with pdfplumber.open(BytesIO(MINIO.get(bucket, row["location"]))) as pdf:
            img = pdf.pages[0].to_image().annotated
            set_thumbnail(did, _thumbnail_b64(img))
    elif re.search(r"\.(jpg|jpeg|png|gif|svg|apng|webp|icon|ico)$", name):
        img = Image.open(BytesIO(MINIO.get(bucket, row["location"])))
        set_thumbnail(did, _thumbnail_b64(img))
    elif re.search(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$", name):
        url = MINIO.get_presigned_url(bucket,
                                      row["location"],
                                      expires=datetime.timedelta(seconds=60)
                                      )
        cap = cv2.VideoCapture(url)
        frame = None
        try:
            if cap.isOpened():
                # Skip a random 1..11 frames and keep the next one. Reading
                # stops there, so exactly one thumbnail is written; a clip
                # shorter than the skip falls back to its last frame.
                for _ in range(random.randint(1, 11) + 1):
                    ok, grabbed = cap.read()
                    if not ok:
                        break
                    frame = grabbed
        finally:
            cap.release()
            cv2.destroyAllWindows()
        if frame is not None:
            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            set_thumbnail(did, _thumbnail_b64(img))
    else:
        # Unsupported type: store a marker so the row is no longer empty.
        set_thumbnail(did, "_")
98
+
99
+
100
def main(comm, mod):
    """Process one batch of documents for a single worker.

    Args:
        comm: Total number of workers; forwarded to ``collect``.
        mod: This worker's index; forwarded to ``collect``.

    The checkpoint file ``res/thumbnail-{comm}-{mod}.tm`` is passed to
    ``findMaxDt`` to obtain the starting timestamp, and the ``updated_at`` of
    every processed row is appended to it, one per line.
    """
    tm_fnm = f"res/thumbnail-{comm}-{mod}.tm"
    tm = findMaxDt(tm_fnm)
    rows = collect(comm, mod, tm)
    if len(rows) == 0:
        return

    # The context manager closes (and flushes) the checkpoint even when
    # build() raises, so timestamps of rows already processed are kept.
    with open(tm_fnm, "a+") as tmf:
        for _, r in rows.iterrows():
            build(r)
            tmf.write(str(r["updated_at"]) + "\n")
112
+
113
+
114
if __name__ == "__main__":
    # mpi4py is only needed when the file runs as a script, so it is
    # imported here rather than at the top of the module.
    from mpi4py import MPI

    # Communicator size and this process's rank become main()'s comm / mod.
    comm = MPI.COMM_WORLD
    main(comm.Get_size(), comm.Get_rank())
118
+
python/util/minio_conn.py CHANGED
@@ -54,11 +54,24 @@ class HuMinio(object):
54
  r = self.conn.get_object(bucket, fnm)
55
  return r.read()
56
  except Exception as e:
57
- logging.error(f"Fail get {bucket}/{fnm}: "+str(e))
58
  self.__open__()
59
  time.sleep(1)
60
  return
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  if __name__ == "__main__":
63
  conn = HuMinio("infiniflow")
64
  fnm = "/opt/home/kevinhu/docgpt/upload/13/11-408.jpg"
 
54
  r = self.conn.get_object(bucket, fnm)
55
  return r.read()
56
  except Exception as e:
57
+ logging.error(f"fail get {bucket}/{fnm}: "+str(e))
58
  self.__open__()
59
  time.sleep(1)
60
  return
61
 
62
+
63
def get_presigned_url(self, bucket, fnm, expires):
    """Return a presigned GET URL for ``bucket``/``fnm``, retrying on failure.

    Args:
        bucket: Bucket name.
        fnm: Object name inside the bucket.
        expires: Lifetime of the URL, forwarded unchanged to the client.

    Returns:
        The URL from ``self.conn.get_presigned_url``, or ``None`` after 10
        failed attempts.
    """
    attempts = 10
    for attempt in range(1, attempts + 1):
        try:
            return self.conn.get_presigned_url("GET", bucket, fnm, expires)
        except Exception as e:
            logging.error(f"fail get {bucket}/{fnm}: "+str(e))
            # Always reconnect so later calls start from a fresh client.
            self.__open__()
            # Pause only when another attempt follows; sleeping after the
            # last failure would just delay the None result.
            if attempt < attempts:
                time.sleep(1)
    return None
72
+
73
+
74
+
75
  if __name__ == "__main__":
76
  conn = HuMinio("infiniflow")
77
  fnm = "/opt/home/kevinhu/docgpt/upload/13/11-408.jpg"
src/api/doc_info.rs CHANGED
@@ -1,6 +1,7 @@
1
- use std::collections::HashMap;
2
  use std::io::BufReader;
3
  use actix_multipart_extract::{ File, Multipart, MultipartForm };
 
4
  use actix_web::{ HttpResponse, post, web };
5
  use chrono::{ Utc, FixedOffset };
6
  use minio::s3::args::{ BucketExistsArgs, MakeBucketArgs, PutObjectArgs };
@@ -68,7 +69,7 @@ pub struct UploadForm {
68
  fn file_type(filename: &String) -> String {
69
  let fnm = filename.to_lowercase();
70
  if
71
- let Some(_) = Regex::new(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)$")
72
  .unwrap()
73
  .captures(&fnm)
74
  {
@@ -76,7 +77,7 @@ fn file_type(filename: &String) -> String {
76
  }
77
  if
78
  let Some(_) = Regex::new(
79
- r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)$"
80
  )
81
  .unwrap()
82
  .captures(&fnm)
@@ -84,14 +85,14 @@ fn file_type(filename: &String) -> String {
84
  return "Picture".to_owned();
85
  }
86
  if
87
- let Some(_) = Regex::new(r"\.(WAV|FLAC|APE|ALAC|WavPack|WV|MP3|AAC|Ogg|Vorbis|Opus)$")
88
  .unwrap()
89
  .captures(&fnm)
90
  {
91
  return "Music".to_owned();
92
  }
93
  if
94
- let Some(_) = Regex::new(r"\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp)$")
95
  .unwrap()
96
  .captures(&fnm)
97
  {
@@ -100,6 +101,7 @@ fn file_type(filename: &String) -> String {
100
  "Other".to_owned()
101
  }
102
 
 
103
  #[post("/v1.0/upload")]
104
  async fn upload(
105
  payload: Multipart<UploadForm>,
 
1
+ use std::collections::{HashMap};
2
  use std::io::BufReader;
3
  use actix_multipart_extract::{ File, Multipart, MultipartForm };
4
+ use actix_web::web::Bytes;
5
  use actix_web::{ HttpResponse, post, web };
6
  use chrono::{ Utc, FixedOffset };
7
  use minio::s3::args::{ BucketExistsArgs, MakeBucketArgs, PutObjectArgs };
 
69
  fn file_type(filename: &String) -> String {
70
  let fnm = filename.to_lowercase();
71
  if
72
+ let Some(_) = Regex::new(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$")
73
  .unwrap()
74
  .captures(&fnm)
75
  {
 
77
  }
78
  if
79
  let Some(_) = Regex::new(
80
+ r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)$"
81
  )
82
  .unwrap()
83
  .captures(&fnm)
 
85
  return "Picture".to_owned();
86
  }
87
  if
88
+ let Some(_) = Regex::new(r"\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$")
89
  .unwrap()
90
  .captures(&fnm)
91
  {
92
  return "Music".to_owned();
93
  }
94
  if
95
+ let Some(_) = Regex::new(r"\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key)$")
96
  .unwrap()
97
  .captures(&fnm)
98
  {
 
101
  "Other".to_owned()
102
  }
103
 
104
+
105
  #[post("/v1.0/upload")]
106
  async fn upload(
107
  payload: Multipart<UploadForm>,