Commit 8d17acbe by Dave Foote

safety while I add some graphs/write a bit

parent f24bee12
......@@ -30,8 +30,10 @@ def rolling_window_splitter(df, date_col, window, features):
splits df into rolling time periods based on a date column
window is the period length in months
'''
features.append('Y')
features.append('date_posted')
df = df.sort_values('date_posted')
df = df.loc[:,features + [date_col]]
df = df.loc[:,features]
start = pd.Timestamp(df.iloc[0][date_col])
next_edge = pd.Timestamp(add_months(start, window))
end = pd.Timestamp(df.iloc[-1][date_col])
......@@ -42,10 +44,16 @@ def rolling_window_splitter(df, date_col, window, features):
start = next_edge
next_edge = pd.Timestamp(add_months(start, window))
rv.append(df.loc[df[date_col] > start])
features.pop()
features.pop()
return rv
def x_y_split(df):
return df.iloc[:,:-2], df.iloc[:,-1]
def x_y_split(data):
y = data.Y
x = data.drop('Y', axis=1)
return x, y
def convert_with_format(df, column_name):
return pd.to_datetime(df[column_name], format='%m/%d/%y')
......
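For context, a minimal usage sketch of the updated splitter and x_y_split, assuming this first file is donation_analysis.py (it is imported as don in the pipeline module below). The CSV path, date format, and feature names are placeholders for illustration, not part of the commit:

import pandas as pd
import donation_analysis as don

df = pd.read_csv('projects.csv')                       # placeholder path; df is assumed to carry a binary 'Y' label column
df['date_posted'] = don.convert_with_format(df, 'date_posted')
windows = don.rolling_window_splitter(df, 'date_posted', 6,
                                      ['students_reached', 'total_price'])
x, y = don.x_y_split(windows[0])                       # new signature: splits on the 'Y' column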
......@@ -5,7 +5,6 @@ General set of Pipeline Tools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
import sklearn.tree as tree
from sklearn import preprocessing
from sklearn.svm import LinearSVC
......@@ -15,12 +14,15 @@ from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from scipy import stats
from plotnine import *
import graphviz
import donation_analysis as don
import pylab as pl
'''
Data Cleaning
......@@ -71,7 +73,7 @@ def encode_column(col):
return le.transform(col)
def x_y_split(df):
return df.iloc[:,:-1], df.iloc[:,-1]
return df.drop('date_posted', axis=1), df.iloc[:,-1]
'''
Classifiers
......@@ -87,9 +89,9 @@ def svm_classifier(x_train, y_train, x_test):
svm.fit(x_train, y_train)
confidence = svm.decision_function(x_test)
return confidence
return list(confidence)
def logreg_classifier(x_train, y_train, x_test, c, p):
def logreg_classifier(x_train, y_train, x_test, c, p, solver='lbfgs'):
'''
fit a logistic regression model and return predicted probabilities
'''
......@@ -118,7 +120,7 @@ def dectree_classifier(x_train, y_train, x_test, crit):
min_samples_split=lim)
dec_tree.fit(x_train, y_train)
return dec_tree.predict_proba(x_test)[:1]
return dec_tree.predict_proba(x_test)[:,0]
def random_forest_classifier(n_trees, max_features, x_train, y_train,
x_test):
......@@ -130,7 +132,7 @@ def random_forest_classifier(n_trees, max_features, x_train, y_train,
max_features=max_features, random_state=0)
rfc.fit(x_train, y_train)
return rfc.predict_proba(x_test)
return list(rfc.predict_proba(x_test)[:,1])
def gradient_boosting_classifier(x_train, y_train, x_test):
'''
......@@ -141,7 +143,7 @@ def gradient_boosting_classifier(x_train, y_train, x_test):
sgbc = GradientBoostingClassifier()
sgbc.fit(x_train, y_train)
return sgbc.predict_proba(x_test)
return sgbc.predict_proba(x_test)[:,1]
def bagging_classifier(x_train, y_train, x_test):
'''
......@@ -151,15 +153,56 @@ def bagging_classifier(x_train, y_train, x_test):
'''
bc = BaggingClassifier(max_features=.75)
bc.fit(x_train, y_train)
return bc.predict_proba(x_test)
return bc.predict_proba(x_test)[:,0]
'''
Eval Main Functions
'''
classifiers = ['logreg', 'knn', 'dectree', 'svm', 'RandomForest', 'Boosting',
'Bagging']
thresh = [.01, .02, .05, .1, .2, .3, .5]
#included .01 and 1.0 to gather precision/recall scores on full inclusion/
#exclusion
def big_board(windows, classifiers=classifiers,thresh=thresh):
'''
runs every model you ask for on each rolling window and builds a master
stats board across all of them
'''
rv = []
rdf = pd.DataFrame()
for i, period in enumerate(windows[:-1]):
if i > 0:
a, b = don.x_y_split(period)
x = pd.concat([x, a], ignore_index=True, sort=False)
y = pd.concat([y, b], ignore_index=True, sort=False)
else:
x, y = don.x_y_split(period)
x = x.drop('date_posted', axis=1)
xt, yt = don.x_y_split(windows[i+1])
xt = xt.drop('date_posted', axis=1)
if 'logreg' in classifiers:
rv.append(evaluate_logreg(x,y,xt,yt))
if 'knn' in classifiers:
rv.append(evaluate_knn(x,y,xt,yt))
if 'dectree' in classifiers:
rv.append(evaluate_dectree(x,y,xt,yt))
if 'svm' in classifiers:
rv.append(evaluate_svm(x,y,xt,yt))
if 'RandomForest' in classifiers:
rv.append(evaluate_rf(x,y,xt,yt))
if 'Boosting' in classifiers:
rv.append(evaluate_gb(x,y,xt,yt))
if 'Bagging' in classifiers:
rv.append(evaluate_bagging(x,y,xt,yt))
for df in rv:
rdf = pd.concat([rdf, df], ignore_index=True, sort=False)
rdf.to_csv('results.csv')
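A sketch of how big_board might be driven end to end; the module name pipeline, the CSV path, the window size, and the feature list are assumptions for illustration:

import pandas as pd
import donation_analysis as don
import pipeline as pipe                                # assumed name of this module

df = pd.read_csv('projects.csv')                       # placeholder path
df['date_posted'] = don.convert_with_format(df, 'date_posted')
windows = don.rolling_window_splitter(df, 'date_posted', 6,
                                      ['students_reached', 'total_price'])
# each period trains on all earlier windows and tests on the next one;
# per-model precision/recall/accuracy rows are written to results.csv
pipe.big_board(windows, classifiers=['logreg', 'RandomForest'],
               thresh=[.05, .1, .2])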
def evaluate_logreg(x_train, y_train, x_test, y_test,
c_values=[.01,.1,1,10,100], thresh=thresh):
......@@ -167,14 +210,15 @@ def evaluate_logreg(x_train, y_train, x_test, y_test,
generates df of predictions, penalties, c_values, thresholds, precision, recall, and
accuracy of logistic regression
'''
penalties = ['l1', 'l2']
penalties = ['l2']
rd = {'predicted': [], 'penalty': [], 'C': [], 'threshold': [],
'precision': [], 'recall': [], 'accuracy':[]}
'precision': [], 'recall': [], 'accuracy':[], 'class': []}
for p in penalties:
for c in c_values:
scores = logreg_classifier(x_train, y_train, x_test, c, p)
for t in thresh:
scores = list(stats.rankdata(scores, 'average')/len(scores))
preds = [compare_to_threshold(x, t)for x in scores]
rd['predicted'].append(preds)
rd['penalty'].append(p)
......@@ -183,6 +227,7 @@ def evaluate_logreg(x_train, y_train, x_test, y_test,
rd['precision'].append(precision(y_test, preds))
rd['recall'].append(recall(y_test, preds))
rd['accuracy'].append(accuracy(y_test, preds))
rd['class'].append('logreg')
return pd.DataFrame(rd)
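Each evaluate_* function uses the same scoring step: raw classifier scores are converted to percentile ranks with stats.rankdata and then compared against a population-share threshold. A standalone sketch of that step follows; compare_to_threshold's body is not shown in this diff, so the >= rule below is an assumption:

from scipy import stats

def rank_and_label(scores, t):
    # percentile rank of each score, values in (0, 1]
    pct = list(stats.rankdata(scores, 'average') / len(scores))
    # assumed rule: flag a case once its percentile reaches the cutoff
    return [1 if p >= t else 0 for p in pct]

rank_and_label([0.2, 0.9, 0.4, 0.7], 0.6)              # -> [0, 1, 0, 1]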
......@@ -193,10 +238,11 @@ def evaluate_knn(x_train, y_train, x_test, y_test, kays=[3,5,7,9,11],
recall, and accuracy to help find best model
'''
rd = {'predicted': [], 'k':[], 'threshold': [],
'precision': [], 'recall': [], 'accuracy':[]}
'precision': [], 'recall': [], 'accuracy':[], 'class': []}
for k in kays:
scores = knn_classifier(x_train, y_train, x_test, k)
scores = knn_classifier(x_train, y_train, x_test, k)[:,1]
for t in thresh:
scores = list(stats.rankdata(scores, 'average')/len(scores))
preds = [compare_to_threshold(x, t) for x in scores]
rd['predicted'].append(preds)
rd['k'].append(k)
......@@ -204,7 +250,8 @@ def evaluate_knn(x_train, y_train, x_test, y_test, kays=[3,5,7,9,11],
rd['precision'].append(precision(y_test, preds))
rd['recall'].append(recall(y_test, preds))
rd['accuracy'].append(accuracy(y_test, preds))
rd['class'].append('knn')
return pd.DataFrame(rd)
def evaluate_dectree(x_train, y_train, x_test, y_test, thresh=thresh):
......@@ -213,65 +260,89 @@ def evaluate_dectree(x_train, y_train, x_test, y_test, thresh=thresh):
'''
criterion = ['entropy', 'gini']
rd = {'predicted': [], 'crit': [], 'threshold': [],
'precision': [], 'recall': [], 'accuracy':[]}
'precision': [], 'recall': [], 'accuracy':[], 'class': []}
for c in criterion:
scores = dectree_classifier(x_train, y_train, x_test, c)
for t in thresh:
preds = [compare_to_threshold(x, t) for x in scores]
scores = list(stats.rankdata(scores, 'average')/len(scores))
preds = [compare_to_threshold(x, t) for x in list(scores)]
rd['predicted'].append(preds)
rd['crit'].append(c)
rd['threshold'].append(t)
rd['precision'].append(precision(y_test, preds))
rd['recall'].append(recall(y_test, preds))
rd['accuracy'].append(accuracy(y_test, preds))
rd['class'].append('dectree')
return pd.DataFrame(rd)
def evaluate_rf(x_train, y_train, x_test, y_test, thresh=thresh, ntrees=[25,100,500],
maxfeats=[1, .5, 1.0]):
maxfeats=[1, .5, 4]):
rd = {'predicted': [], 'ntrees':[], 'nfeats': [], 'threshold': [],
'precision': [], 'recall': [], 'accuracy':[]}
'precision': [], 'recall': [], 'accuracy':[], 'class': []}
for size in ntrees:
for f in maxfeats:
scores = random_forest_classifier(size, f, x_train, y_train, x_test)
for t in thresh:
scores = list(stats.rankdata(scores, 'average')/len(scores))
preds = [compare_to_threshold(x, t) for x in scores]
rd['predicted'].append(preds)
rd['ntrees'].append(size)
rd['maxfeats'].append(f)
rd['nfeats'].append(f)
rd['threshold'].append(t)
rd['precision'].append(precision(y_test, preds))
rd['recall'].append(recall(y_test, preds))
rd['accuracy'].append(accuracy(y_test, preds))
rd['class'].append('rf')
return pd.DataFrame(rd)
def evaluate_gb(x_train, y_train, x_test, y_test, thresh=thresh):
rd = {'predicted': [], 'threshold': [], 'precision': [], 'recall': [],
'accuracy':[]}
scores = gradient_boosting_classifier(x_train, y_train, x_test)
'accuracy':[], 'class': []}
scores = list(gradient_boosting_classifier(x_train, y_train, x_test))
for t in thresh:
scores = list(stats.rankdata(scores, 'average')/len(scores))
preds = [compare_to_threshold(x, t) for x in scores]
rd['predicted'].append(preds)
rd['threshold'].append(t)
rd['precision'].append(precision(y_test, preds))
rd['recall'].append(recall(y_test, preds))
rd['accuracy'].append(accuracy(y_test, preds))
rd['class'].append('gb')
return pd.DataFrame(rd)
def evaluate_svm(x_train, y_train, x_test, y_test, thresh=thresh):
rd = {'predicted': [],'threshold': [], 'precision': [], 'recall': [],
'accuracy':[], 'class': []}
scores = svm_classifier(x_train, y_train, x_test)
for t in thresh:
scores = list(stats.rankdata(scores, 'average')/len(scores))
preds = [compare_to_threshold(x, t) for x in scores]
rd['predicted'].append(preds)
rd['threshold'].append(t)
rd['precision'].append(precision(y_test, preds))
rd['recall'].append(recall(y_test, preds))
rd['accuracy'].append(accuracy(y_test, preds))
rd['class'].append('svm')
return pd.DataFrame(rd)
def evaluate_bagging(x_train, y_train, x_test, y_test, thresh=thresh):
rd = {'predicted': [], 'threshold': [], 'precision': [], 'recall': [],
'accuracy':[]}
'accuracy':[], 'class': []}
scores = bagging_classifier(x_train, y_train, x_test)
for t in thresh:
preds = [compare_to_threshold(x, t) for x in scores]
scores = list(stats.rankdata(scores, 'average')/len(scores))
preds = [compare_to_threshold(x, t) for x in list(scores)]
rd['predicted'].append(preds)
rd['threshold'].append(t)
rd['precision'].append(precision(y_test, preds))
rd['recall'].append(recall(y_test, preds))
rd['accuracy'].append(accuracy(y_test, preds))
rd['class'].append('bagging')
return pd.DataFrame(rd)
......@@ -298,6 +369,29 @@ def compare_to_threshold(score, threshold):
'''
Visualize Data
'''
def plot_roc(name, probs, true, output_type):
'''
ROC-plotting function adapted from the professor's magicloops repo on GitHub
output_types: 'show', 'save'
'''
fpr, tpr, thresholds = roc_curve(true, probs)
roc_auc = auc(fpr, tpr)
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.05])
pl.ylim([0.0, 1.05])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title(name)
pl.legend(loc="lower right")
if (output_type == 'save'):
plt.savefig(name)
elif (output_type == 'show'):
plt.show()
else:
plt.show()
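A hedged usage sketch for plot_roc with one of the probability-returning classifiers above; the variable names and output file name are placeholders:

# x_train/y_train/x_test/y_test as produced in big_board's windowing loop
probs = gradient_boosting_classifier(x_train, y_train, x_test)   # P(Y = 1) per row
plot_roc('gb_roc_window_1', probs, y_test, 'save')               # saves gb_roc_window_1.png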
def plot_precision_recall_curve(y_test, pred_probas):
'''
sklearn documentation says y_true goes first but this disagrees with our
......