# Flask service: extract the skills mentioned in a CV with a local
# Phi-3-mini model served through llama.cpp (LangChain wrapper).
import os

from flask import Flask, request, jsonify
from langchain_community.llms import LlamaCpp

app = Flask(__name__)

n_gpu_layers = 0  # CPU-only; raise this to offload layers to a GPU build of llama.cpp
n_batch = 1024    # prompt tokens processed per batch


llm = LlamaCpp(
    model_path="Phi-3-mini-4k-instruct-q4.gguf",  # path to GGUF file
    temperature=0.1,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    verbose=True,
    n_ctx=4096,  # matches the model's 4k context window
)

file_size = os.stat("Phi-3-mini-4k-instruct-q4.gguf")
print("model size:", file_size.st_size, "bytes")


@app.route('/', methods=['POST'])
def get_skills():
    cv_body = (request.get_json(silent=True) or {}).get('cv_body')
    if not cv_body:
        return jsonify({'error': 'No cv_body provided'}), 400

    # Simple inference example. In the Phi-3 chat format the question belongs
    # in the user turn, and the prompt ends with <|assistant|> so the model
    # generates the answer rather than echoing the question.
    output = llm.invoke(
        f"<|user|>\nCan you list the skills mentioned in this CV?\n\n{cv_body}<|end|>\n<|assistant|>",
        max_tokens=256,    # generate up to 256 tokens
        stop=["<|end|>"],  # cut generation at the end-of-turn token
    )

    return jsonify({'skills': output})

if __name__ == '__main__':
    app.run()
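
# A minimal sketch of how a client might call the endpoint above, assuming
# the dev server listens on Flask's default http://127.0.0.1:5000 (the URL
# and sample CV text are illustrative, not from the original):
#
#   import requests
#
#   resp = requests.post(
#       "http://127.0.0.1:5000/",
#       json={"cv_body": "Python developer with Flask and Docker experience."},
#   )
#   print(resp.json()["skills"])
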
# --- Second, independent Flask service (its own file in practice): ranks a
# --- set of reference texts by similarity to an input text using averaged
# --- Word2Vec embeddings, and returns a bar chart of the scores.
from flask import Flask, request, jsonify
import nltk
from gensim.models import Word2Vec
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib
matplotlib.use('Agg')  # non-interactive backend so plotting works in a server
import matplotlib.pyplot as plt
import io
import base64

nltk.download('punkt')  # tokenizer data used by nltk.word_tokenize

app = Flask(__name__)

# Toy reference corpus; in practice these would come from real documents.
texts = [
    "This is a sample text.",
    "Another example of text.",
    "More texts to compare."
]

tokenized_texts = [nltk.word_tokenize(text.lower()) for text in texts]

# Train a small Word2Vec model on the corpus (min_count=1 keeps every word,
# which only makes sense for a corpus this tiny).
word_embeddings_model = Word2Vec(
    sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4
)

def text_embedding(text):
    """Average the Word2Vec vectors of a text's in-vocabulary words."""
    words = nltk.word_tokenize(text.lower())
    embeddings = [word_embeddings_model.wv[word] for word in words if word in word_embeddings_model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    # No known words: fall back to the zero vector.
    return np.zeros(word_embeddings_model.vector_size)
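
# A quick sanity check of the averaging scheme (illustrative REPL session,
# assuming the toy corpus above):
#
#   >>> text_embedding("a sample text").shape
#   (100,)
#
# Fully out-of-vocabulary input maps to the zero vector, which scikit-learn's
# cosine_similarity scores as 0 against everything.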

@app.route('/process', methods=['POST'])
def process():
    data = request.get_json()
    input_text = data.get('input_text', '')

    if not input_text:
        return jsonify({'error': 'No input text provided'}), 400

    input_embedding = text_embedding(input_text)
    text_embeddings = [text_embedding(text) for text in texts]

    similarities = cosine_similarity([input_embedding], text_embeddings).flatten()
    # Cast to plain floats: numpy scalars are not JSON-serializable.
    similarities_percentages = [float(similarity) * 100 for similarity in similarities]

    fig, ax = plt.subplots(figsize=(10, 6))
    texts_for_plotting = [f"Text {i+1}" for i in range(len(texts))]
    ax.bar(texts_for_plotting, similarities_percentages)
    ax.set_ylabel('Similarity (%)')
    ax.set_xlabel('Texts')
    ax.set_title('Similarity of Input Text with other texts')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    # Render the chart to a base64-encoded PNG so it fits in the JSON response.
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    img_base64 = base64.b64encode(buf.read()).decode('utf-8')
    plt.close(fig)

    # Rank the reference texts by similarity and keep the top three.
    sorted_indices = np.argsort(similarities)[::-1]
    similar_texts = [(float(similarities[idx]) * 100, texts[idx]) for idx in sorted_indices[:3]]

    response = {
        'similarities': similarities_percentages,
        'plot': img_base64,
        'most_similar_texts': similar_texts
    }

    return jsonify(response)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080, debug=True)
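
# A minimal client sketch for the /process endpoint above, assuming the
# server runs on port 8080 as configured (the sample input is illustrative):
#
#   import requests
#
#   resp = requests.post(
#       "http://127.0.0.1:8080/process",
#       json={"input_text": "A sample text about examples."},
#   )
#   body = resp.json()
#   print(body["most_similar_texts"])  # top-3 (score %, text) pairs
#   # body["plot"] holds the bar chart as a base64-encoded PNG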