June 15, 2022
Today's widget is a Jupyter Notebook that takes us 20,000 leagues under the sea. A logistic regression model is trained on sonar readings to predict whether the object they bounced off is a rock or a mine!
#https://www.youtube.com/watch?v=fiz1ORTBGpY&list=PLfFghEzKVmjvuSA67LszN1dZ-Dd_pkus6
#https://re-thought.com/pandas-value_counts/
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# load the Sonar dataset; the CSV has no header row, so header=None
data = pd.read_csv("dataset/sonar.all-data 1.csv", header=None)
# show the first 5 rows
data.head()
# number of rows and columns
data.shape
# summary statistics for each column
data.describe()
# count how many rocks (R) and mines (M); column 60 holds the label
data[60].value_counts()
# mean of each feature, grouped by label
data.groupby(60).mean()
# separate the features from the labels (R = rock, M = mine)
x = data.drop(columns=60, axis=1)
y = data[60]
print(x)
print(y)
# split into training and test data
# test_size is the fraction held out for testing; 0.1 means 10%
# stratify=y keeps the rock/mine proportions roughly the same in both splits
# x_train/x_test hold the features; y_train/y_test hold the labels
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, stratify=y, random_state=1)
print(x_train.shape, x_test.shape)
#print(x_train)
#print(y_train)
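To double-check that the stratified split behaved as described, a quick optional peek at the label proportions in each split (plain pandas, not part of the original tutorial) does the trick:
# sanity check: the R/M proportions should be close in both splits
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))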
# create the logistic regression model and fit it to the training data
model = LogisticRegression()
model.fit(x_train, y_train)
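If you're curious what the model actually learned, the fitted LogisticRegression object exposes one weight per sonar band plus an intercept; this optional peek isn't in the tutorial, but the attributes are standard scikit-learn:
# one coefficient per input column, plus a single intercept
print(model.coef_.shape)
print(model.intercept_)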
#Model accuracy on training data
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)
print("Accuracy on Training Data : ", training_data_accuracy)
# model accuracy on the test data (same steps as above, on the held-out split)
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)
print("Accuracy on Test Data : ", test_data_accuracy)
Making a Predictive System
#known rock data
#input_data = (0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,0.1609,0.1582,0.2238,0.0645,0.0660,0.2273,0.3100,0.2999,0.5078,0.4797,0.5783,0.5071,0.4328,0.5550,0.6711,0.6415,0.7104,0.8080,0.6791,0.3857,0.1307,0.2604,0.5121,0.7547,0.8537,0.8507,0.6692,0.6097,0.4943,0.2744,0.0510,0.2834,0.2825,0.4256,0.2641,0.1386,0.1051,0.1343,0.0383,0.0324,0.0232,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032)
#known mine data
#input_data = (0.0654,0.0649,0.0737,0.1132,0.2482,0.1257,0.1797,0.0989,0.2460,0.3422,0.2128,0.1377,0.4032,0.5684,0.2398,0.4331,0.5954,0.5772,0.8176,0.8835,0.5248,0.6373,0.8375,0.6699,0.7756,0.8750,0.8300,0.6896,0.3372,0.6405,0.7138,0.8202,0.6657,0.5254,0.2960,0.0704,0.0970,0.3941,0.6028,0.3521,0.3924,0.4808,0.4602,0.4164,0.5438,0.5649,0.3195,0.2484,0.1299,0.0825,0.0243,0.0210,0.0361,0.0239,0.0447,0.0394,0.0355,0.0440,0.0243,0.0098)
input_data = (0.04,0.04,0.07,0.12,0.242,0.127,0.17,0.089,0.20,0.2,0.18,0.17,0.42,0.54,0.98,0.41,0.1,0.72,0.176,0.835,0.528,0.633,0.875,0.669,0.776,0.870,0.800,0.696,0.372,0.605,0.718,0.822,0.667,0.524,0.290,0.074,0.090,0.391,0.608,0.351,0.394,0.488,0.462,0.414,0.548,0.569,0.315,0.244,0.129,0.085,0.023,0.020,0.031,0.039,0.047,0.094,0.055,0.040,0.043,0.008)
# convert the input tuple to a numpy array (faster and easier to work with)
input_data_as_numpy_array = np.asarray(input_data)
# reshape to (1, 60): the model expects a 2D array, and the leading 1 marks a single instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)
prediction = model.predict(input_data_reshaped)
print(prediction)
if prediction[0] == "R":
    print("Model predicts a rock")
else:
    print("!!! CAUTION !!! Model predicts a mine")