Python ML Pages - 1 2 3 4 5 6
Here are some of the best libraries for speeding model development, with an explanation of how they do it.
Important: the winequality-red.csv data file is included in Py-LM-example.zip.
Download Python Project : Py-LM-example.zip - (165 KB zip file) Download winequality-red.csv file - it is part of Py-LM-example.zip
Source Code of Python Project - Py-LM-example :
Download this source code as a Python file: Py_LM_example.py - (1.59 KB Python file) download
# pip install pandas
# pip install scipy  (pearsonr is imported from scipy.stats)
# calculate summary stats
from
import
# create a simple list
mylist=[1,2,3,4,5,6,7,8,9,10]
# calculate statistics
print
# Compute correlation
# calculate correlation coefficient
# Load the red-wine quality dataset from winequality-red.csv (a CSV file, not an Excel workbook)
sr = pd.read_csv(
#, header=0).iloc[:-1]
# Print the first few rows using the head() function.
header =
# prepare data
data1 = sr[
data2 = sr[
data3 = sr[
data4 = sr[
# calculate Pearson's correlation
corr, p = pearsonr(data1, data2)
# display the correlation
corr, p = pearsonr(data1, data3)
corr, p = pearsonr(data1, data4)
# Correlation matrix of the dataset loaded from winequality-red.csv (a CSV file)
correlation_matrix = sr.corr()
Output of Py-LM-example project
Important: the winequalityN.csv data file is included in Py-LM-QPrediction.zip.
Download Python Project : Py-LM-QPrediction.zip - (112 KB zip file) Download winequalityN.csv file - it is part of Py-LM-QPrediction.zip
Source Code of Python Project -Py-LM-QPrediction :
Download this source code as a Python file: Py_LM_QPrediction.py - (2.11 KB Python file) download
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import warnings
warnings.filterwarnings('ignore')
# Step 1: load the wine-quality data and preview its first five rows.
df = pd.read_csv('winequalityN.csv')
first_rows = df.head()
print("1- df.head >>> ")
print(first_rows)
print("")

# Step 2: report the data type held by each column.
print("2- df.info >>> ")
info_result = df.info()  # info() writes its report to stdout itself and returns None
print(info_result)       # hence this prints the literal "None", exactly as before

# Step 3: descriptive statistics, transposed so each column becomes a row.
print("3- df.describe().T >>> ")
summary_stats = df.describe().transpose()
print(summary_stats)

# Step 4: first EDA check - count the missing values in every column.
print("4- df.isnull().sum() >>> ")
missing_per_column = df.isna().sum()
print(missing_per_column)
#5- Impute missing values with the column mean. A mean only exists for
#   numeric columns, so text columns are skipped instead of crashing the
#   script when one of them contains a missing value.
for col in df.columns:
    column = df[col]
    if pd.api.types.is_numeric_dtype(column) and column.isnull().any():
        df[col] = column.fillna(column.mean())

# Total number of missing cells left in the whole frame
# (0 when every column with gaps was numeric).
print("5- df.isnull().sum().sum() >>> ")
print(df.isnull().sum().sum())
# Step 6: one histogram per column to visualise the distribution of the
# continuous values in the dataset.
df.hist(figsize=(9, 9), bins=22)

# Step 7: bar chart of alcohol content against wine quality, drawn on
# figure number 2.
plt.figure(2)
quality_values = df['quality']
alcohol_values = df['alcohol']
plt.bar(quality_values, alcohol_values)
plt.xlabel('quality')
plt.ylabel('alcohol')
# Display every figure created so far.
plt.show()
#8- Convert 'object' columns to numeric when every value in them parses as a
#   number. Every column is inspected, not just whichever one a previous
#   loop happened to leave behind in `col`.
for col in df.columns:
    if df[col].dtype == 'object':
        try:
            # Strict conversion: raises instead of silently turning text into NaN.
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # Genuinely textual column (e.g. category labels): keep it untouched
            # so later steps can still encode it.
            continue
# This code is modified by Susobhan Akhuli
plt.figure(3)
# Correlate numeric columns only: DataFrame.corr() raises on text columns in
# pandas 2.x. The boolean mask highlights pairs with correlation above 0.7.
numeric_corr = df.select_dtypes(include='number').corr()
sb.heatmap(numeric_corr > 0.7, annot=True, cbar=False)
# The heatmap is created after the earlier plt.show(), so it needs its own
# call to be displayed when this runs as a script.
plt.show()
#9- Optional follow-up (disabled): drop the 'total sulfur dioxide' feature.
#df = df.drop('total sulfur dioxide', axis=1)
if __name__ == "__main__":
    print()
Output of Python Project -Py-LM-QPrediction :
Important: the winequalityN.csv data file is included in Py-LM-Development.zip.
Download Python Project : Py-LM-Development.zip - (113 KB zip file) Download winequalityN.csv file - it is part of Py-LM-Development.zip
Source Code of Python Project -Py-LM-Development :
Download this source code as a Python file: Py_LM_Development.py - (2.96 KB Python file) download
#pip install scikit-learn xgboost  (the `sklearn` import name is provided by the scikit-learn package)
from sklearn.model_selection import train_test_split #train_test_split, it is part of scikit-learn
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.svm import SVC
from xgboost import XGBClassifier #XGBClassifier is part of XGBoost
from sklearn.linear_model import LogisticRegression
#1a- look at the first five rows of the dataset.
# NOTE(review): `df` is expected to already hold the wine-quality DataFrame;
# no load step is visible in this listing - confirm before running standalone.
print("1a- df.head >>> ")
# Print the rows themselves; previously only the label above was emitted.
print(df.head())
# Step 2a: model development.
# Several classifiers are trained below and compared on held-out data.

# Binary label: 1 when the quality score is above 5, otherwise 0.
is_good_wine = df['quality'] > 5
df['best quality'] = is_good_wine.astype('int64')

# The two text categories are replaced by numeric codes so the frame can be
# fed to the models.
wine_type_codes = {'white': 1, 'red': 0}
df.replace(wine_type_codes, inplace=True)

# Separate the inputs from the label; both the raw score and the derived
# label are removed from the feature matrix.
#features = features.fillna(features.mean())
target = df['best quality']
features = df.drop(columns=['quality', 'best quality'])
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    features, target, test_size=0.2, random_state=40)

# Impute missing values after splitting: the column means are learned from
# the training rows only and then reused for the validation rows.
from sklearn.impute import SimpleImputer  # SimpleImputer is part of scikit-learn
imputer = SimpleImputer(strategy='mean')  # Or another strategy like 'median'
xtrain = imputer.fit_transform(xtrain)
xtest = imputer.transform(xtest)

# (The bare `xtrain.shape, xtest.shape` expression was a no-op outside a
# notebook and has been dropped; the shapes are printed just below.)
print("2a- Model Development - xtrain.shape, xtest.shape >>> ")
print(xtrain.shape, xtest.shape)
#3a- Normalising the data before training helps achieve stable and fast
#    training. The scaler is fitted on the training rows only and then
#    applied unchanged to the validation rows.
norm = MinMaxScaler()
xtrain = norm.fit_transform(xtrain)
xtest = norm.transform(xtest)

models = [LogisticRegression(), XGBClassifier(), SVC(kernel='rbf')]
print("3a- Normalising the data >>> ")

# Fit each candidate and report its score on both splits. Iterating over the
# list itself keeps the loop correct if models are added or removed.
# NOTE: the values labelled "Accuracy" are ROC AUC scores computed from the
# hard class predictions returned by predict().
for i, model in enumerate(models):
    model.fit(xtrain, ytrain)
    print('<<< for i = >>>', i)
    print("<<< Models >>> ", f'{model} : ')
    print('<<< Training Accuracy : ',
          metrics.roc_auc_score(ytrain, model.predict(xtrain)))
    print('<<< Validation Accuracy : ',
          metrics.roc_auc_score(ytest, model.predict(xtest)))
#4a- Model Evaluation
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# models[1] is the second entry of the `models` list, i.e. the trained
# XGBClassifier.
evaluated_model = models[1]
cm = confusion_matrix(ytest, evaluated_model.predict(xtest))
# `classes_` supplies the class labels shown on the axes.
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=evaluated_model.classes_)
disp.plot()
# plot() only draws onto a figure; show() is required for the window to
# appear when this runs as a plain script.
plt.show()
Output of Python Project -Py-LM-Development :