June 15, 2022

Welcome to the wonderful world of widgets.

Today's widget is a Jupyter Notebook file that takes us 20,000 leagues under the sea. The logistic regression model is trained to predict if the sonar input data is a rock or a mine!

In [41]:
#https://www.youtube.com/watch?v=fiz1ORTBGpY&list=PLfFghEzKVmjvuSA67LszN1dZ-Dd_pkus6
#https://re-thought.com/pandas-value_counts/

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
data = pd.read_csv("dataset/sonar.all-data 1.csv", header=None)
#prints first 5 rows
data.head()
# number of rows and columns
data.shape
# stats
data.describe()
# gives how many rocks and mines, 60 is column index; omit [60] in visual code
data[60].value_counts()
Out[41]:
M    111
R     97
Name: 60, dtype: int64
In [42]:
data.groupby(60).mean()
Out[42]:
0 1 2 3 4 5 6 7 8 9 ... 50 51 52 53 54 55 56 57 58 59
60
M 0.034989 0.045544 0.050720 0.064768 0.086715 0.111864 0.128359 0.149832 0.213492 0.251022 ... 0.019352 0.016014 0.011643 0.012185 0.009923 0.008914 0.007825 0.009060 0.008695 0.006930
R 0.022498 0.030303 0.035951 0.041447 0.062028 0.096224 0.114180 0.117596 0.137392 0.159325 ... 0.012311 0.010453 0.009640 0.009518 0.008567 0.007430 0.007814 0.006677 0.007078 0.006024

2 rows × 60 columns

In [44]:
#separate data and labels (rock and mine)

x = data.drop(columns=60,axis=1)
y = data[60]
print(x)
print(y)
         0       1       2       3       4       5       6       7       8   \
0    0.0200  0.0371  0.0428  0.0207  0.0954  0.0986  0.1539  0.1601  0.3109   
1    0.0453  0.0523  0.0843  0.0689  0.1183  0.2583  0.2156  0.3481  0.3337   
2    0.0262  0.0582  0.1099  0.1083  0.0974  0.2280  0.2431  0.3771  0.5598   
3    0.0100  0.0171  0.0623  0.0205  0.0205  0.0368  0.1098  0.1276  0.0598   
4    0.0762  0.0666  0.0481  0.0394  0.0590  0.0649  0.1209  0.2467  0.3564   
..      ...     ...     ...     ...     ...     ...     ...     ...     ...   
203  0.0187  0.0346  0.0168  0.0177  0.0393  0.1630  0.2028  0.1694  0.2328   
204  0.0323  0.0101  0.0298  0.0564  0.0760  0.0958  0.0990  0.1018  0.1030   
205  0.0522  0.0437  0.0180  0.0292  0.0351  0.1171  0.1257  0.1178  0.1258   
206  0.0303  0.0353  0.0490  0.0608  0.0167  0.1354  0.1465  0.1123  0.1945   
207  0.0260  0.0363  0.0136  0.0272  0.0214  0.0338  0.0655  0.1400  0.1843   

         9   ...      50      51      52      53      54      55      56  \
0    0.2111  ...  0.0232  0.0027  0.0065  0.0159  0.0072  0.0167  0.0180   
1    0.2872  ...  0.0125  0.0084  0.0089  0.0048  0.0094  0.0191  0.0140   
2    0.6194  ...  0.0033  0.0232  0.0166  0.0095  0.0180  0.0244  0.0316   
3    0.1264  ...  0.0241  0.0121  0.0036  0.0150  0.0085  0.0073  0.0050   
4    0.4459  ...  0.0156  0.0031  0.0054  0.0105  0.0110  0.0015  0.0072   
..      ...  ...     ...     ...     ...     ...     ...     ...     ...   
203  0.2684  ...  0.0203  0.0116  0.0098  0.0199  0.0033  0.0101  0.0065   
204  0.2154  ...  0.0051  0.0061  0.0093  0.0135  0.0063  0.0063  0.0034   
205  0.2529  ...  0.0155  0.0160  0.0029  0.0051  0.0062  0.0089  0.0140   
206  0.2354  ...  0.0042  0.0086  0.0046  0.0126  0.0036  0.0035  0.0034   
207  0.2354  ...  0.0181  0.0146  0.0129  0.0047  0.0039  0.0061  0.0040   

         57      58      59  
0    0.0084  0.0090  0.0032  
1    0.0049  0.0052  0.0044  
2    0.0164  0.0095  0.0078  
3    0.0044  0.0040  0.0117  
4    0.0048  0.0107  0.0094  
..      ...     ...     ...  
203  0.0115  0.0193  0.0157  
204  0.0032  0.0062  0.0067  
205  0.0138  0.0077  0.0031  
206  0.0079  0.0036  0.0048  
207  0.0036  0.0061  0.0115  

[208 rows x 60 columns]
0      R
1      R
2      R
3      R
4      R
      ..
203    M
204    M
205    M
206    M
207    M
Name: 60, Length: 208, dtype: object
In [55]:
#split into training and test data
#test_size is the fraction of the data for the test data; 0.1 is 10%
#stratify to have ~equal number of rocks and mines in training data
#x_train is the data; y_train is the label

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, stratify=y, random_state=1)
print(x_train.shape, x_test.shape)
#print(x_train)
#print(y_train)
(187, 60) (21, 60)
In [54]:
#load model training logistic regression
model = LogisticRegression()
model.fit(x_train, y_train)
Out[54]:
LogisticRegression()
In [57]:
#Model accuracy on training data
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction, y_train)
print("Accuracy on Training Data : ", training_data_accuracy)
Accuracy on Training Data :  0.8342245989304813
In [60]:
#Model accuracy on test data; change all "train" to "test"
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(x_test_prediction, y_test)
print("Accuracy on Test Data : ", test_data_accuracy)
Accuracy on Test Data :  0.7619047619047619

Making a Predictive System

In [83]:
#known rock data
#input_data = (0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,0.1609,0.1582,0.2238,0.0645,0.0660,0.2273,0.3100,0.2999,0.5078,0.4797,0.5783,0.5071,0.4328,0.5550,0.6711,0.6415,0.7104,0.8080,0.6791,0.3857,0.1307,0.2604,0.5121,0.7547,0.8537,0.8507,0.6692,0.6097,0.4943,0.2744,0.0510,0.2834,0.2825,0.4256,0.2641,0.1386,0.1051,0.1343,0.0383,0.0324,0.0232,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032)

#known mine data
#input_data = (0.0654,0.0649,0.0737,0.1132,0.2482,0.1257,0.1797,0.0989,0.2460,0.3422,0.2128,0.1377,0.4032,0.5684,0.2398,0.4331,0.5954,0.5772,0.8176,0.8835,0.5248,0.6373,0.8375,0.6699,0.7756,0.8750,0.8300,0.6896,0.3372,0.6405,0.7138,0.8202,0.6657,0.5254,0.2960,0.0704,0.0970,0.3941,0.6028,0.3521,0.3924,0.4808,0.4602,0.4164,0.5438,0.5649,0.3195,0.2484,0.1299,0.0825,0.0243,0.0210,0.0361,0.0239,0.0447,0.0394,0.0355,0.0440,0.0243,0.0098)

input_data = (0.04,0.04,0.07,0.12,0.242,0.127,0.17,0.089,0.20,0.2,0.18,0.17,0.42,0.54,0.98,0.41,0.1,0.72,0.176,0.835,0.528,0.633,0.875,0.669,0.776,0.870,0.800,0.696,0.372,0.605,0.718,0.822,0.667,0.524,0.290,0.074,0.090,0.391,0.608,0.351,0.394,0.488,0.462,0.414,0.548,0.569,0.315,0.244,0.129,0.085,0.023,0.020,0.031,0.039,0.047,0.094,0.055,0.040,0.043,0.008)

# input_data to numpy array bc process faster; converts list to numpy array
input_data_as_numpy_array = np.asarray(input_data)

#reshape data for one data point, else model will be confused, does not know it is looking at one; "1" for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if(prediction[0] == "R"):
    print("Model predicts a rock")
else: 
    print ("!!! CAUTION !!! Model predicts a mine")
['M']
!!! CAUTION !!! Model predicts a mine