Commit 8d17acbe by Dave Foote

safety while I add some graphs/write a bit

parent f24bee12
......@@ -30,8 +30,10 @@ def rolling_window_splitter(df, date_col, window, features):
splits df into rolling time periods based on a date column
window is the period length in months
'''
features.append('Y')
features.append('date_posted')
df = df.sort_values('date_posted')
df = df.loc[:,features + [date_col]]
df = df.loc[:,features]
start = pd.Timestamp(df.iloc[0][date_col])
next_edge = pd.Timestamp(add_months(start, window))
end = pd.Timestamp(df.iloc[-1][date_col])
......@@ -42,10 +44,16 @@ def rolling_window_splitter(df, date_col, window, features):
start = next_edge
next_edge = pd.Timestamp(add_months(start, window))
rv.append(df.loc[df[date_col] > start])
features.pop()
features.pop()
return rv
def x_y_split(df):
return df.iloc[:,:-2], df.iloc[:,-1]
def x_y_split(data):
y = data.Y
x = data.drop('Y', axis=1)
return x, y
def convert_with_format(df, column_name):
return pd.to_datetime(df[column_name], format='%m/%d/%y')
......
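For context, a minimal usage sketch of the updated splitter and x_y_split, assuming this first file is donation_analysis.py (it is imported as don in the pipeline module below). The CSV path, date format, and feature names are placeholders for illustration, not part of the commit:

import pandas as pd
import donation_analysis as don

df = pd.read_csv('projects.csv')                       # placeholder path; df is assumed to carry a binary 'Y' label column
df['date_posted'] = don.convert_with_format(df, 'date_posted')
windows = don.rolling_window_splitter(df, 'date_posted', 6,
                                      ['students_reached', 'total_price'])
x, y = don.x_y_split(windows[0])                       # new signature: splits on the 'Y' column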
......@@ -5,7 +5,6 @@ General set of Pipeline Tools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
import sklearn.tree as tree
from sklearn import preprocessing
from sklearn.svm import LinearSVC
......@@ -15,12 +14,15 @@ from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from scipy import stats
from plotnine import *
import graphviz
import donation_analysis as don
import pylab as pl
'''
Data Cleaning
......@@ -71,7 +73,7 @@ def encode_column(col):
return le.transform(col)
def x_y_split(df):
return df.iloc[:,:-1], df.iloc[:,-1]
return df.drop('date_posted', axis=1), df.iloc[:,-1]
'''
Classifiers
......@@ -87,9 +89,9 @@ def svm_classifier(x_train, y_train, x_test):
svm.fit(x_train, y_train)
confidence = svm.decision_function(x_test)
return confidence
return list(confidence)
def logreg_classifier(x_train, y_train, x_test, c, p):
def logreg_classifier(x_train, y_train, x_test, c, p, solver='lbfgs'):
'''
fit a logistic regression model and return predicted probabilities
'''
......@@ -118,7 +120,7 @@ def dectree_classifier(x_train, y_train, x_test, crit):
min_samples_split=lim)
dec_tree.fit(x_train, y_train)
return dec_tree.predict_proba(x_test)[:1]
return dec_tree.predict_proba(x_test)[:,0]
def random_forest_classifier(n_trees, max_features, x_train, y_train,
x_test):
......@@ -130,7 +132,7 @@ def random_forest_classifier(n_trees, max_features, x_train, y_train,
max_features=max_features, random_state=0)
rfc.fit(x_train, y_train)
return rfc.predict_proba(x_test)
return list(rfc.predict_proba(x_test)[:,1])
def gradient_boosting_classifier(x_train, y_train, x_test):
'''
......@@ -141,7 +143,7 @@ def gradient_boosting_classifier(x_train, y_train, x_test):
sgbc = GradientBoostingClassifier()
sgbc.fit(x_train, y_train)
return sgbc.predict_proba(x_test)
return sgbc.predict_proba(x_test)[:,1]
def bagging_classifier(x_train, y_train, x_test):
'''
......@@ -151,15 +153,56 @@ def bagging_classifier(x_train, y_train, x_test):
'''
bc = BaggingClassifier(max_features=.75)
bc.fit(x_train, y_train)
return bc.predict_proba(x_test)
return bc.predict_proba(x_test)[:,0]
'''
Eval Main Functions
'''
classifiers = ['logreg', 'knn', 'dectree', 'svm', 'RandomForest', 'Boosting',
'Bagging']
thresh = [.01, .02, .05, .1, .2, .3, .5]
#included .01 and 1.0 to gather precision/recall scores on full inclusion/
#exclusion
def big_board(windows, classifiers=classifiers,thresh=thresh):
'''
runs every model you ask for on each rolling window and builds a master
stats board across all of them
'''
rv = []
rdf = pd.DataFrame()
for i, period in enumerate(windows[:-1]):
if i > 0:
a, b = don.x_y_split(period)
x = pd.concat([x, a], ignore_index=True, sort=False)
y = pd.concat([y, b], ignore_index=True, sort=False)
else:
x, y = don.x_y_split(period)
x = x.drop('date_posted', axis=1)
xt, yt = don.x_y_split(windows[i+1])
xt = xt.drop('date_posted', axis=1)
if 'logreg' in classifiers:
rv.append(evaluate_logreg(x,y,xt,yt))
if 'knn' in classifiers:
rv.append(evaluate_knn(x,y,xt,yt))
if 'dectree' in classifiers:
rv.append(evaluate_dectree(x,y,xt,yt))
if 'svm' in classifiers:
rv.append(evaluate_svm(x,y,xt,yt))
if 'RandomForest' in classifiers:
rv.append(evaluate_rf(x,y,xt,yt))
if 'Boosting' in classifiers:
rv.append(evaluate_gb(x,y,xt,yt))
if 'Bagging' in classifiers:
rv.append(evaluate_bagging(x,y,xt,yt))
for df in rv:
rdf = pd.concat([rdf, df], ignore_index=True, sort=False)
rdf.to_csv('results.csv')
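A sketch of how big_board might be driven end to end; the module name pipeline, the CSV path, the window size, and the feature list are assumptions for illustration:

import pandas as pd
import donation_analysis as don
import pipeline as pipe                                # assumed name of this module

df = pd.read_csv('projects.csv')                       # placeholder path
df['date_posted'] = don.convert_with_format(df, 'date_posted')
windows = don.rolling_window_splitter(df, 'date_posted', 6,
                                      ['students_reached', 'total_price'])
# each period trains on all earlier windows and tests on the next one;
# per-model precision/recall/accuracy rows are written to results.csv
pipe.big_board(windows, classifiers=['logreg', 'RandomForest'],
               thresh=[.05, .1, .2])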
def evaluate_logreg(x_train, y_train, x_test, y_test,
c_values=[.01,.1,1,10,100], thresh=thresh):
......@@ -167,14 +210,15 @@ def evaluate_logreg(x_train, y_train, x_test, y_test,
generates df of predictions, penalties, c_values, thresholds, precision, recall, and
accuracy of logistic regression
'''
penalties = ['l1', 'l2']
penalties = ['l2']
rd = {'predicted': [], 'penalty': [], 'C': [], 'threshold': [],
'precision': [], 'recall': [], 'accuracy':[]}
'precision': [], 'recall': [], 'accuracy':[], 'class': []}
for p in penalties:
for c in c_values:
scores = logreg_classifier(x_train, y_train, x_test, c, p)
for t in thresh:
scores = list(stats.rankdata(scores, 'average')/len(scores))
preds = [compare_to_threshold(x, t)for x in scores]
rd['predicted'].append(preds)
rd['penalty'].append(p)
......@@ -183,6 +227,7 @@ def evaluate_logreg(x_train, y_train, x_test, y_test,
rd['precision'].append(precision(y_test, preds))
rd['recall'].append(recall(y_test, preds))
rd['accuracy'].append(accuracy(y_test, preds))
rd['class'].append('logreg')
return pd.DataFrame(rd)
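Each evaluate_* function uses the same scoring step: raw classifier scores are converted to percentile ranks with stats.rankdata and then compared against a population-share threshold. A standalone sketch of that step follows; compare_to_threshold's body is not shown in this diff, so the >= rule below is an assumption:

from scipy import stats

def rank_and_label(scores, t):
    # percentile rank of each score, values in (0, 1]
    pct = list(stats.rankdata(scores, 'average') / len(scores))
    # assumed rule: flag a case once its percentile reaches the cutoff
    return [1 if p >= t else 0 for p in pct]

rank_and_label([0.2, 0.9, 0.4, 0.7], 0.6)              # -> [0, 1, 0, 1]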
......@@ -193,10 +238,11 @@ def evaluate_knn(x_train, y_train, x_test, y_test, kays=[3,5,7,9,11],
recall, and accuracy to help find best model
'''
rd = {'predicted': [], 'k':[], 'threshold': [],
'precision': [], 'recall': [], 'accuracy':[]}
'precision': [], 'recall': [], 'accuracy':[], 'class': []}
for k in kays:
scores = knn_classifier(x_train, y_train, x_test, k)
scores = knn_classifier(x_train, y_train, x_test, k)[:,1]
for t in thresh:
scores = list(stats.rankdata(scores, 'average')/len(scores))
preds = [compare_to_threshold(x, t) for x in scores]
rd['predicted'].append(preds)
rd['k'].append(k)
......@@ -204,7 +250,8 @@ def evaluate_knn(x_train, y_train, x_test, y_test, kays=[3,5,7,9,11],
rd['precision'].append(precision(y_test, preds))
rd['recall'].append(recall(y_test, preds))
rd['accuracy'].append(accuracy(y_test, preds))
rd['class'].append('knn')
return pd.DataFrame(rd)
def evaluate_dectree(x_train, y_train, x_test, y_test, thresh=thresh):
......@@ -213,65 +260,89 @@ def evaluate_dectree(x_train, y_train, x_test, y_test, thresh=thresh):
'''
criterion = ['entropy', 'gini']
rd = {'predicted': [], 'crit': [], 'threshold': [],
'precision': [], 'recall': [], 'accuracy':[]}
'precision': [], 'recall': [], 'accuracy':[], 'class': []}
for c in criterion:
scores = dectree_classifier(x_train, y_train, x_test, c)
for t in thresh:
preds = [compare_to_threshold(x, t) for x in scores]
scores = list(stats.rankdata(scores, 'average')/len(scores))
preds = [compare_to_threshold(x, t) for x in list(scores)]
rd['predicted'].append(preds)
rd['crit'].append(c)
rd['threshold'].append(t)
rd['precision'].append(precision(y_test, preds))
rd['recall'].append(recall(y_test, preds))
rd['accuracy'].append(accuracy(y_test, preds))
rd['class'].append('dectree')
return pd.DataFrame(rd)
def evaluate_rf(x_train, y_train, x_test, y_test, thresh=thresh, ntrees=[25,100,500],
maxfeats=[1, .5, 1.0]):
maxfeats=[1, .5, 4]):
rd = {'predicted': [], 'ntrees':[], 'nfeats': [], 'threshold': [],
'precision': [], 'recall': [], 'accuracy':[]}
'precision': [], 'recall': [], 'accuracy':[], 'class': []}
for size in ntrees:
for f in maxfeats:
scores = random_forest_classifier(size, f, x_train, y_train, x_test)
for t in thresh:
scores = list(stats.rankdata(scores, 'average')/len(scores))
preds = [compare_to_threshold(x, t) for x in scores]
rd['predicted'].append(preds)
rd['ntrees'].append(size)
rd['maxfeats'].append(f)
rd['nfeats'].append(f)
rd['threshold'].append(t)
rd['precision'].append(precision(y_test, preds))
rd['recall'].append(recall(y_test, preds))
rd['accuracy'].append(accuracy(y_test, preds))
rd['class'].append('rf')
return pd.DataFrame(rd)
def evaluate_gb(x_train, y_train, x_test, y_test, thresh=thresh):
rd = {'predicted': [], 'threshold': [], 'precision': [], 'recall': [],
'accuracy':[]}
scores = gradient_boosting_classifier(x_train, y_train, x_test)
'accuracy':[], 'class': []}
scores = list(gradient_boosting_classifier(x_train, y_train, x_test))
for t in thresh:
scores = list(stats.rankdata(scores, 'average')/len(scores))
preds = [compare_to_threshold(x, t) for x in scores]
rd['predicted'].append(preds)
rd['threshold'].append(t)
rd['precision'].append(precision(y_test, preds))
rd['recall'].append(recall(y_test, preds))
rd['accuracy'].append(accuracy(y_test, preds))
rd['class'].append('gb')
return pd.DataFrame(rd)
def evaluate_svm(x_train, y_train, x_test, y_test, thresh=thresh):
rd = {'predicted': [],'threshold': [], 'precision': [], 'recall': [],
'accuracy':[], 'class': []}
scores = svm_classifier(x_train, y_train, x_test)
for t in thresh:
scores = list(stats.rankdata(scores, 'average')/len(scores))
preds = [compare_to_threshold(x, t) for x in scores]
rd['predicted'].append(preds)
rd['threshold'].append(t)
rd['precision'].append(precision(y_test, preds))
rd['recall'].append(recall(y_test, preds))
rd['accuracy'].append(accuracy(y_test, preds))
rd['class'].append('svm')
return pd.DataFrame(rd)
def evaluate_bagging(x_train, y_train, x_test, y_test, thresh=thresh):
rd = {'predicted': [], 'threshold': [], 'precision': [], 'recall': [],
'accuracy':[]}
'accuracy':[], 'class': []}
scores = bagging_classifier(x_train, y_train, x_test)
for t in thresh:
preds = [compare_to_threshold(x, t) for x in scores]
scores = list(stats.rankdata(scores, 'average')/len(scores))
preds = [compare_to_threshold(x, t) for x in list(scores)]
rd['predicted'].append(preds)
rd['threshold'].append(t)
rd['precision'].append(precision(y_test, preds))
rd['recall'].append(recall(y_test, preds))
rd['accuracy'].append(accuracy(y_test, preds))
rd['class'].append('bagging')
return pd.DataFrame(rd)
......@@ -298,6 +369,29 @@ def compare_to_threshold(score, threshold):
'''
Visualize Data
'''
def plot_roc(name, probs, true, output_type):
'''
ROC-plotting function adapted from the professor's magicloops repo on GitHub
output_types: 'show', 'save'
'''
fpr, tpr, thresholds = roc_curve(true, probs)
roc_auc = auc(fpr, tpr)
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.05])
pl.ylim([0.0, 1.05])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title(name)
pl.legend(loc="lower right")
if (output_type == 'save'):
plt.savefig(name)
elif (output_type == 'show'):
plt.show()
else:
plt.show()
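A hedged usage sketch for plot_roc with one of the probability-returning classifiers above; the variable names and output file name are placeholders:

# x_train/y_train/x_test/y_test as produced in big_board's windowing loop
probs = gradient_boosting_classifier(x_train, y_train, x_test)   # P(Y = 1) per row
plot_roc('gb_roc_window_1', probs, y_test, 'save')               # saves gb_roc_window_1.png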
def plot_precision_recall_curve(y_test, pred_probas):
'''
sklearn documentation says y_true goes first but this disagrees with our
......