Files changed (1) hide show
  1. app.py +165 -4
app.py CHANGED
@@ -1,7 +1,168 @@
1
- from fastapi import FastAPI
 
 
 
 
 
 
 
 
2
 
3
  app = FastAPI()
4
 
5
- @app.get("/")
6
- def greet_json():
7
- return {"Hello": "World!"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import io
3
+ import re
4
+ import yaml
5
+ from typing import List, Optional
6
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Query
7
+ from fastapi.responses import JSONResponse
8
+ import uvicorn
9
+ from fastapi.middleware.cors import CORSMiddleware
10
 
11
  app = FastAPI()
12
 
13
# Load the column configuration once at import time.
# The encoding is passed explicitly: YAML streams are Unicode (UTF-8 by
# default), and without it open() falls back to a platform-dependent codec
# that can garble any non-ASCII column names or synonyms in the file.
with open("column_config.yaml", encoding="utf-8") as f:
    COLUMN_CONFIG = yaml.safe_load(f)
16
+
17
def detect_column_type(dtype):
    """Map a pandas dtype to the coarse type label used in API responses.

    Returns ``"datetime"`` for any datetime64 dtype, ``"number"`` for any
    numeric dtype, and ``"text"`` for everything else. The datetime check
    runs first.
    """
    type_checks = (
        ("datetime", pd.api.types.is_datetime64_any_dtype),
        ("number", pd.api.types.is_numeric_dtype),
    )
    for label, matches in type_checks:
        if matches(dtype):
            return label
    return "text"
24
+
25
def normalize_column_names(column_names: List[str]) -> List[str]:
    """Map raw column labels to canonical names from ``COLUMN_CONFIG``.

    Each label is sanitized (trimmed, runs of non-word characters collapsed
    to ``_``, lower-cased, surrounding ``_`` removed). If the sanitized label
    equals the sanitized form of a configured column name or one of its
    ``synonyms``, the configured column name is returned; otherwise the
    sanitized label itself is returned.

    Args:
        column_names: Raw column labels. Non-string labels (for example the
            integer headers a spreadsheet can produce) are converted with
            ``str()`` before sanitizing.

    Returns:
        A list with one normalized name per input label, in the same order.
    """
    def _sanitize(label) -> str:
        return re.sub(r'[\W]+', '_', str(label).strip()).lower().strip('_')

    # Build the alias -> canonical-name table once instead of re-sanitizing
    # every synonym for every input column. setdefault keeps the first
    # configured column that claims an alias, i.e. first match wins.
    canonical_by_alias = {}
    for config_col, config in (COLUMN_CONFIG['columns'] or {}).items():
        # A column declared with no body, or with `synonyms:` left empty,
        # loads from YAML as None; treat both as "no synonyms".
        synonyms = (config or {}).get('synonyms') or []
        for alias in [config_col, *synonyms]:
            canonical_by_alias.setdefault(_sanitize(alias), config_col)

    return [
        canonical_by_alias.get(sanitized, sanitized)
        for sanitized in map(_sanitize, column_names)
    ]
41
+
42
def _parse_datetime_column(values: pd.Series, fmt: Optional[str]) -> pd.Series:
    """Parse *values* into UTC timestamps truncated to whole seconds.

    The configured *fmt* (if any) is applied on the first and only parse, so
    ambiguous day/month strings are read as configured. Entries that cannot
    be parsed become ``NaT``.
    """
    parsed = pd.to_datetime(values, errors='coerce', format=fmt, utc=True)
    return parsed.dt.floor('s')


def _clip_outliers(values: pd.Series) -> pd.Series:
    """Clamp *values* to ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return values.clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)


def _normalize_text(value):
    """Trim and lower-case *value* if it is a string; return it unchanged otherwise."""
    return value.strip().lower() if isinstance(value, str) else value


def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *df* driven by ``COLUMN_CONFIG``.

    The input frame is not modified. Columns are renamed with
    ``normalize_column_names``; only columns present in
    ``COLUMN_CONFIG['columns']`` are processed further, by configured
    ``type`` (default ``'text'``):

    * ``datetime``: parsed with the configured ``format`` into UTC
      timestamps with one-second resolution; invalid values become NaT.
    * ``numeric``: coerced to float (invalid values become NaN), then
      clipped to the 1.5 * IQR fences.
    * ``categorical``: values outside the configured ``allowed`` list are
      blanked out.
    * ``text``: string values are stripped and lower-cased; non-string
      values are kept as they are.

    Exact duplicate rows are dropped after the typed columns are coerced and
    before text normalization. The original index labels are kept.

    Args:
        df: Frame as read from the uploaded file.

    Returns:
        The cleaned frame, with missing values replaced via
        ``replace({pd.NA: None})``.
    """
    # Work on a copy so the caller can still use the raw frame.
    cleaned = df.copy()
    cleaned.columns = normalize_column_names(cleaned.columns)

    columns_config = COLUMN_CONFIG['columns'] or {}
    # A column declared with no body loads from YAML as None.
    configured = [
        (col, columns_config[col] or {})
        for col in cleaned.columns
        if col in columns_config
    ]

    for col, config in configured:
        col_type = config.get('type', 'text')
        if col_type == 'datetime':
            cleaned[col] = _parse_datetime_column(cleaned[col], config.get('format'))
        elif col_type == 'numeric':
            numeric = pd.to_numeric(cleaned[col], errors='coerce').astype(float)
            cleaned[col] = _clip_outliers(numeric)
        elif col_type == 'categorical':
            allowed = config.get('allowed') or []
            cleaned[col] = cleaned[col].where(cleaned[col].isin(allowed))

    cleaned = cleaned.drop_duplicates()

    for col, config in configured:
        if config.get('type', 'text') == 'text':
            # Element-wise instead of the .str accessor: .str raises on
            # non-string columns and turns non-string cells into NaN.
            cleaned[col] = cleaned[col].map(_normalize_text)

    return cleaned.replace({pd.NA: None})
104
+
105
def process_file(file: UploadFile, sheet_name: Optional[str] = None) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Read an uploaded CSV/XLSX file and return its raw and cleaned frames.

    Args:
        file: The uploaded file. Its format is chosen from the filename
            extension, compared case-insensitively.
        sheet_name: Sheet to read for ``.xlsx`` uploads; the first sheet is
            used when omitted. Ignored for CSV.

    Returns:
        ``(raw_df, cleaned_df)``. ``clean_data`` receives a copy, so
        ``raw_df`` stays exactly as parsed from the file.

    Raises:
        HTTPException: 400 when the extension is not ``csv`` or ``xlsx``;
            500 when reading or cleaning the file fails.
    """
    # filename may be None for uploads sent without one.
    filename = file.filename or ""
    extension = filename.rsplit('.', 1)[-1].lower()
    # Reject unsupported formats outside the try block so the 400 is not
    # swallowed by the generic handler below and turned into a 500.
    if extension not in ('csv', 'xlsx'):
        raise HTTPException(400, "Formato de arquivo não suportado")

    try:
        content = file.file.read()
        if extension == 'csv':
            df = pd.read_csv(io.BytesIO(content))
        else:
            # Default to the first sheet.
            target_sheet = 0 if sheet_name is None else sheet_name
            df = pd.read_excel(io.BytesIO(content), sheet_name=target_sheet)
        return df, clean_data(df.copy())
    except Exception as e:
        raise HTTPException(500, f"Erro ao processar o arquivo: {str(e)}") from e
121
+
122
+ # Endpoint para upload e processamento de arquivos
123
+ @app.post("/process-file")
124
+ async def process_file_endpoint(file: UploadFile = File(...), sheet_name: Optional[str] = Query(None)):
125
+ try:
126
+ raw_df, df = process_file(file, sheet_name)
127
+
128
+ columns = [{
129
+ "name": col,
130
+ "type": detect_column_type(df[col].dtype)
131
+ } for col in df.columns]
132
+
133
+ rows = []
134
+ for idx, row in df.iterrows():
135
+ cells = {}
136
+ for col, val in row.items():
137
+ cells[col] = {
138
+ "value": val,
139
+ "displayValue": str(val),
140
+ "columnId": col
141
+ }
142
+ rows.append({"id": str(idx), "cells": cells})
143
+
144
+ return JSONResponse(
145
+ content={
146
+ "data": {
147
+ "columns": columns,
148
+ "rows": rows
149
+ },
150
+ "metadata": {
151
+ "totalRows": len(df),
152
+ "processedAt": pd.Timestamp.now().isoformat()
153
+ }
154
+ })
155
+ except Exception as e:
156
+ raise HTTPException(500, f"Erro: {str(e)}")
157
+
158
# CORS configuration: every origin, method and header is allowed, and
# credentialed requests are enabled.
# NOTE(review): the FastAPI CORS docs state that wildcard origins must not be
# combined with allow_credentials=True. Confirm whether credentials are
# needed; if so, list the permitted origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
166
+
167
# Script entry point: serve the "app:app" application with uvicorn on all
# interfaces, port 7860, with auto-reload enabled.
if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)