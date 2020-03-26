Discover, triage, and prioritize Python errors in real-time
k-NN similarity search is powered by Open Distro for Elasticsearch, an Apache 2.0-licensed distribution of Elasticsearch.
%%sh
# Create the mount point and mount the EFS file system over NFS 4.1 at ./model
# (the directory the transformer is later saved to and loaded from).
# -p keeps the cell re-runnable: plain `mkdir` fails if ./model already exists.
mkdir -p model
sudo mount -t nfs \
    -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 \
    fs-xxxxxx.efs.ap-southeast-2.amazonaws.com:/ \
    ./model
Here, fs-xxxxxx.efs.ap-southeast-2.amazonaws.com is the DNS name of the EFS file system.
from sentence_transformers import models, losses, SentenceTransformer
# Assemble a sentence-embedding pipeline: DistilBERT token embeddings ->
# mean pooling -> dense projection down to 256 dimensions, then save it.
word_embedding_model = models.DistilBERT('distilbert-base-uncased')

# Query the embedding width once and reuse it for both the pooling layer and
# the dense layer's input, instead of hard-coding 768 in a second place.
# Only mean pooling is enabled below, so the pooled sentence vector is expected
# to have the same width as a single token embedding.
embedding_dim = word_embedding_model.get_word_embedding_dimension()

pooling_model = models.Pooling(
    embedding_dim,
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

# Reduce the sentence vector to 256 dimensions; this matches the "dimension"
# declared for the `question_vector` field in the k-NN index mapping.
dense_model = models.Dense(in_features=embedding_dim, out_features=256)

transformer = SentenceTransformer(
    modules=[word_embedding_model, pooling_model, dense_model]
)
transformer.save("model/transformer-v1/")
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
# Authenticate the Kaggle client, then download the Quora question-pairs
# dataset and unzip it into ./quora_dataset.
api = KaggleApi()
api.authenticate()
api.dataset_download_files(
    "quora/question-pairs-dataset",
    path='quora_dataset',
    unzip=True,
)
import pandas as pd
# Show full question text instead of truncating wide columns.
# `None` is the supported "no limit" value; the old `-1` spelling has been
# deprecated since pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)

# Load only the id and text of the first question of each pair.
df = pd.read_csv(
    "quora_dataset/questions.csv",
    usecols=["qid1", "question1"],
    index_col=False,
)

# Shuffle all rows (unseeded, so the sample differs between runs) and keep
# the first 5000 as the set of questions to import.
df = df.sample(frac=1).reset_index(drop=True)
df_questions_imp = df[:5000]
import boto3
from requests_aws4auth import AWS4Auth
from elasticsearch import Elasticsearch, RequestsHttpConnection
# AWS region and service name used when signing requests.
region = 'ap-southeast-2'
service = 'es'

# The Elasticsearch endpoint is read from the SSM parameter /KNNSearch/ESUrl.
ssm = boto3.client('ssm', region_name=region)
es_parameter = ssm.get_parameter(Name='/KNNSearch/ESUrl')
es_host = es_parameter['Parameter']['Value']

# Build the request signer from the current boto3 session's credentials.
credentials = boto3.Session().get_credentials()
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    service,
    session_token=credentials.token,
)

# HTTPS client (port 443, certificates verified) using the signer above.
es = Elasticsearch(
    hosts=[{'host': es_host, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)
# Mapping for the embedding field: a 256-dimensional knn_vector.
question_vector_mapping = {"type": "knn_vector", "dimension": 256}

# Index definition: switch k-NN on for the index and register the vector field.
knn_index = {
    "settings": {"index.knn": True},
    "mappings": {
        "properties": {"question_vector": question_vector_mapping},
    },
}

# ignore=400 makes the client tolerate an HTTP 400 response instead of raising
# (presumably so re-running against an already-existing index is harmless).
es.indices.create(index="questions", body=knn_index, ignore=400)
def es_import(df):
    """Embed every question in *df* and index it into the ``questions`` index.

    Args:
        df: DataFrame with a ``qid1`` column (used as the document id) and a
            ``question1`` column (the question text).

    Each indexed document stores the text under ``question`` and its
    embedding, converted to a plain list, under ``question_vector``.
    """
    questions = list(df["question1"])
    if not questions:
        # Nothing to embed or index.
        return

    # NOTE(review): `local_transformer` is not defined in the visible source;
    # presumably it is the SentenceTransformer loaded from
    # "model/transformer-v1/" -- confirm it is in scope before running.
    # Encode all questions in one batched call rather than invoking the model
    # once per row.
    vectors = local_transformer.encode(questions)

    for doc_id, question, vector in zip(df["qid1"], questions, vectors):
        es.index(
            index='questions',
            id=doc_id,
            body={
                "question_vector": vector.tolist(),
                "question": question,
            },
        )
# Index the sampled subset (the first 5000 rows of the shuffled frame).
es_import(df_questions_imp)
{'question_vector': [-0.06435434520244598, ... ,0.0726890116930008],
'question': 'How hard is it to learn to play piano as an adult?'}
import json
import boto3
from flask import Flask
from flask_restful import reqparse, Resource, Api
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
from sentence_transformers import SentenceTransformer
# Flask application wrapped by a Flask-RESTful Api for resource routing.
app = Flask(__name__)
api = Api(app)

# AWS region and service name used when signing Elasticsearch requests.
region = 'ap-southeast-2'
service = 'es'

# The Elasticsearch endpoint is read from the SSM parameter /KNNSearch/ESUrl.
ssm = boto3.client('ssm', region_name=region)
es_parameter = ssm.get_parameter(Name='/KNNSearch/ESUrl')
host = es_parameter['Parameter']['Value']

# Build the request signer from the current boto3 session's credentials.
credentials = boto3.Session().get_credentials()
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    service,
    session_token=credentials.token,
)
# Request arguments accepted by the search endpoint.
#
# parse_args() fills arguments that were not sent with None, so a
# `.get(name, fallback)` on the parsed result never reaches its fallback, and
# values arrive as strings unless a type is given. Declaring the types and
# defaults here gives the search body real numbers in every case.
parser = reqparse.RequestParser()
# The question text is mandatory: without it there is nothing to embed, so
# reject the request up front instead of failing inside the model.
parser.add_argument('question', required=True)
# Number of neighbours / hits to return.
parser.add_argument('size', type=int, default=5)
# Minimum score a hit must reach to be returned.
parser.add_argument('min_score', type=float, default=0.3)
# HTTPS client (port 443, certificates verified) for the Elasticsearch
# endpoint, with requests signed via `awsauth`.
es = Elasticsearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

# Sentence-embedding model loaded once at import time from the saved
# model directory.
transform_model = SentenceTransformer('model/transformer-v1/')
class SimilarQuestionList(Resource):
    """REST resource that looks up indexed questions similar to a posted one."""

    def post(self):
        """Embed the posted question and run a k-NN query on ``questions``.

        Reads ``question``, ``size`` and ``min_score`` from the parsed request
        arguments. ``size`` is used both as the hit limit and as ``k``; the
        stored ``question_vector`` is excluded from the returned sources.

        Returns:
            A tuple of the raw Elasticsearch search response and the HTTP
            status code 201.
        """
        params = parser.parse_args()

        # encode() takes a list of sentences; only one is sent, so take the
        # first embedding and turn it into a plain list for the JSON body.
        embeddings = transform_model.encode([params["question"]])
        query_vector = embeddings[0].tolist()

        limit = params.get("size", 5)
        threshold = params.get("min_score", 0.3)

        search_body = {
            "size": limit,
            "_source": {"exclude": ["question_vector"]},
            "min_score": threshold,
            "query": {
                "knn": {
                    "question_vector": {
                        "vector": query_vector,
                        "k": limit,
                    },
                },
            },
        }

        response = es.search(index="questions", body=search_body)
        return response, 201
# Route /search to the similar-question resource.
api.add_resource(SimilarQuestionList, '/search')

if __name__ == '__main__':
    # The server listens on all interfaces, so debug mode must stay off:
    # Flask's debug mode enables the Werkzeug interactive debugger, which lets
    # anyone who can reach the port execute arbitrary code.
    app.run(debug=False, host='0.0.0.0', port=8000)
$ curl --data 'question=What is best way to make money online?' --data 'size=5' --data 'min_score=0.3' -X POST http://knn-s-publi-xxxx-207238135.ap-southeast-2.elb.amazonaws.com/search
{
"took": 10,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 15,
"relation": "eq"
},
"max_score": 0.69955945,
"hits": [
{
"_index": "questions",
"_type": "_doc",
"_id": "210905",
"_score": 0.69955945,
"_source": {
"question": "What is an easy way make money online?"
}
},
{
"_index": "questions",
"_type": "_doc",
"_id": "547612",
"_score": 0.61820024,
"_source": {
"question": "What is the best way to make passive income online?"
}
},
{
"_index": "questions",
"_type": "_doc",
"_id": "1891",
"_score": 0.5624176,
"_source": {
"question": "What are the easy ways to earn money online?"
}
},
{
"_index": "questions",
"_type": "_doc",
"_id": "197580",
"_score": 0.46031988,
"_source": {
"question": "What is the best way to download YouTube videos for free?"
}
},
{
"_index": "questions",
"_type": "_doc",
"_id": "359930",
"_score": 0.45543614,
"_source": {
"question": "What is the best way to get traffic on your website?"
}
}
]
}
}