Serving Model
How to serve a model
Here we show how to:
- train a model
- save it to pickle
- load a model and serve it with
aiohttp
.
In order to keep it simple we are going to use a sklearn
model.
Getting data
Dataset train.csv
comes from kaggle: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge. We locally store in data
directory.
import pandas as pd
comments_df = pd.read_csv("data/toxic-comment-classification-challenge/train.csv")
comments_df.head(2)
id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | |
---|---|---|---|---|---|---|---|---|
0 | 0000997932d777bf | Explanation\nWhy the edits made under my usern... | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 000103f0d9cfb60f | D'aww! He matches this background colour I'm s... | 0 | 0 | 0 | 0 | 0 | 0 |
Predict if comment is toxic
Train - validation split
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = \
train_test_split(comments_df[['comment_text']], comments_df['toxic'], random_state=10)
X_train.head(2)
comment_text | |
---|---|
34852 | This is a straw man argument, Mr Merkey. Nobo... |
17133 | ARC Gritt, the fucking cunt of all cunts, ruin... |
Preprocessing and vectorizer
Let’s do some simple preprocessing.
import re
import nltk
from nltk.stem import SnowballStemmer
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
GOOD_SYMBOLS = "€\?"
GOOD_SYMBOLS_RE = re.compile('([' + GOOD_SYMBOLS + '])')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z '+ GOOD_SYMBOLS + ']')
ADD_SPACES_SYMBOLS_RE = re.compile("([\?])")
STEMMER = SnowballStemmer('english')
class TextPreprocessor:
def transfrom_text(self, text):
text = re.sub(GOOD_SYMBOLS_RE, r"\1", text) #process good symbols
text = text.lower()
text = re.sub(REPLACE_BY_SPACE_RE, " ", text) # process bad symbols
text = re.sub(BAD_SYMBOLS_RE, "", text) # process bad symbols
text = re.sub(ADD_SPACES_SYMBOLS_RE, r" \1 ", text)
test = " ".join([STEMMER.stem(word) for word in text.split()])
return text
def transform(self, series):
return series.apply(lambda text: self.transfrom_text(text))
from sklearn.feature_extraction.text import TfidfVectorizer
class Vectorizer:
def __init__(self):
self.vectorizer = TfidfVectorizer(min_df=4, max_df=0.9, ngram_range=(1, 2), token_pattern='(\S+)')
self.pickle_fn = "pickles/messaging_vectorizer.pickle"
def fit(self, column):
self.vectorizer.fit(column)
def transform(self, column):
return self.vectorizer.transform(column)
def dumps(self):
with open(self.pickle_fn, 'wb') as f:
pickle.dump(self.vectorizer, f, pickle.HIGHEST_PROTOCOL)
def load(self):
with open(self.pickle_fn, 'rb') as f:
self.vectorizer = pickle.load(f)
Model
import pickle
from sklearn.linear_model import LogisticRegression
class Model:
def __init__(self):
self.model = LogisticRegression(class_weight='balanced')
self.pickle_fn = "pickles/messaging_model.pickle"
def fit(self, X, y):
self.model.fit(X, y)
def predict(self, X):
return self.model.predict(X)
def dumps(self):
with open(self.pickle_fn, 'wb') as f:
pickle.dump(self.model, f, pickle.HIGHEST_PROTOCOL)
def load(self):
with open(self.pickle_fn, 'rb') as f:
self.model = pickle.load(f)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,\
average_precision_score, roc_auc_score, recall_score
def scores(y, predicted):
return {
'accuracy': accuracy_score(y, predicted),
'precision': precision_score(y, predicted),
'recall': recall_score(y, predicted),
'f1-score': f1_score(y, predicted),
"roc_auc": roc_auc_score(y, predicted),
'average-Precision': average_precision_score(y, predicted)}
Puting preprocessor, vectorizer and model together
class TfidfModelAll:
def __init__(self, colname="text"):
self.colname = colname
self.preprocessor = TextPreprocessor()
self.vectorizer = Vectorizer()
self.model = Model()
def fit_predict(self, X, y):
print("preprocessor...")
X_fe = pd.DataFrame({self.colname: self.preprocessor.transform(X[self.colname])})
print("vectorizer...")
self.vectorizer.fit(X_fe[self.colname])
print("model...")
X_fe = self.vectorizer.transform(X[self.colname])
self.model.fit(X_fe, y)
return self.model.predict(X_fe)
def predict(self, X=None, message=None):
if message is not None:
X = pd.DataFrame({self.colname: [message]})
X_fe = pd.DataFrame({self.colname: self.preprocessor.transform(X[self.colname])})
X_fe = self.vectorizer.transform(X_fe[self.colname])
return self.model.predict(X_fe)
def predict_message(self, message):
return self.predict(message=message)[0]
def dumps(self):
self.vectorizer.dumps()
self.model.dumps()
def load(self):
self.vectorizer.load()
self.model.load()
Train, validate it and save the model to a pickle
tfidf_model = TfidfModelAll("comment_text")
y_train_hat = tfidf_model.fit_predict(X_train, y_train)
scores(y_train, y_train_hat)
preprocessor...
vectorizer...
model...
{'accuracy': 0.953358177777035,
'average-Precision': 0.6641958777165086,
'f1-score': 0.8007424858999072,
'precision': 0.6819066147859922,
'recall': 0.969738889849559,
'roc_auc': 0.96067231602142}
y_val_hat = tfidf_model.predict(X_val)
scores(y_val, y_val_hat)
{'accuracy': 0.9331963001027749,
'average-Precision': 0.5239641176536308,
'f1-score': 0.7035265324285237,
'precision': 0.6010264208325413,
'recall': 0.848175965665236,
'roc_auc': 0.8950682123362819}
tfidf_model.dumps()
Serving model
Load model
tfidf_model2 = TfidfModelAll("comment_text")
tfidf_model2.load()
y_val_hat = tfidf_model2.predict(X_val)
scores(y_val, y_val_hat)
{'accuracy': 0.9331963001027749,
'average-Precision': 0.5239641176536308,
'f1-score': 0.7035265324285237,
'precision': 0.6010264208325413,
'recall': 0.848175965665236,
'roc_auc': 0.8950682123362819}
message = """All of my edits are good.
Cunts like you who revert good edits because you're too stupid to understand how to write well ,
and then revert other edits just because you've decided to bear a playground grudge, are the problem.
Maybe one day you'll realise the damage you did to a noble project. 201.215.187.159"""
tfidf_model2.predict_message(message=message)
1
Serve the model
import asyncio
asyncio.get_event_loop().close()
print(asyncio.get_event_loop().is_closed())
loop = asyncio.new_event_loop()
asyncio.set_event_loop(asyncio.new_event_loop())
True
from aiohttp import web
async def handler(request):
data = await request.post()
message = data.get("message")
prediction = tfidf_model2.predict_message(message=message)
return web.Response(text=str(prediction))
app = web.Application()
app.add_routes([web.post('/toxic', handler)])
web.run_app(app)
======== Running on http://0.0.0.0:8080 ========
(Press CTRL+C to quit)
Check how it work
From command line execute the following:
curl -X POST http://0.0.0.0:8080/toxic -d "message=All of my edits are good.
Cunts like you who revert good edits because you're too stupid to understand how to write well ,
and then revert other edits just because you've decided to bear a playground grudge, are the problem.
Maybe one day you'll realise the damage you did to a noble project. 201.215.187.159"
Thanks
I would like to thank Luca Cerone (http://www.lucacerone.net/) for helpfull conversation and suggesting usage of aiohttp
instead of flask
.