A predictive churn model is a powerful tool for identifying which of your customers will stop engaging with your business. With that information, you can build retention strategies, discount offers, email campaigns, and more that keep your high-value customers buying.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Read the churn data set from the CSV file in the working directory.
dataset = pd.read_csv('Churn_Modeling.csv')

# Columns 3..12 become the feature matrix and column 13 the target vector.
# NOTE(review): positional indexing assumes a fixed CSV column order -- confirm
# against the file header.
X, y = dataset.iloc[:, 3:13].values, dataset.iloc[:, 13].values

# Bare expression: only displayed when run interactively (notebook cell).
dataset.head()

# Show the raw features, a blank separator, then the target.
print(X, '\n', y, sep='\n')
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Integer-encode feature columns 1 and 2 in place.
# NOTE(review): presumably these are the Geography and Gender columns --
# confirm against the CSV header.
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])

# One-hot encode column 1 only. The old `OneHotEncoder(categorical_features=[1])`
# argument was removed from scikit-learn, so the column is selected with a
# ColumnTransformer instead. The layout matches the old behaviour: the one-hot
# columns come first, followed by the untouched remaining columns.
# `sparse_threshold=0` forces a dense result (the old code called `.toarray()`).
# Note: ColumnTransformer fits a clone, so `onehotencoder` itself stays
# unfitted; the fitted encoder is `column_transformer.named_transformers_["onehot"]`.
onehotencoder = OneHotEncoder()
column_transformer = ColumnTransformer(
    [("onehot", onehotencoder, [1])],
    remainder="passthrough",
    sparse_threshold=0,
)
# X is an object array at this point; cast to float so downstream scaling and
# model fitting receive a numeric matrix, as they did before.
X = np.asarray(column_transformer.fit_transform(X), dtype=float)

# Drop the first one-hot column: it is redundant given the others
# (avoids the dummy-variable trap).
X = X[:, 1:]
print("X -> {}".format(X))
print('\n')
print("y -> {}".format(y))
# Quick look at the data. These are bare expressions: their values are shown
# only when run interactively (e.g. as the last line of a notebook cell); in a
# plain script the results are computed and discarded.
# Summary statistics of the data frame's columns.
dataset.describe()
# Column labels of the data frame.
dataset.columns
Germany loses the most customers; the company must look into it.
Female customers are more likely to leave the bank.
High churn rate among customers of age 45-60.
Balance is not creating a significant impact on churn.
Credit Score is not creating a significant impact on churn.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions so no test information leaks in.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)
print(X_train)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Baseline model: logistic regression with default hyper-parameters,
# trained on the scaled training set.
lr_classifier = LogisticRegression().fit(X_train, y_train)
y_lr_pred = lr_classifier.predict(X_test)

# Evaluate on the held-out test set.
lr_confusion = confusion_matrix(y_test, y_lr_pred)
lr_report = classification_report(y_test, y_lr_pred)
print("confusion_matrix:\n {}".format(lr_confusion))
print("\nclassification_report: \n {}".format(lr_report))
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

# Support vector classifier with default hyper-parameters.
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

# Score the held-out test rows and report the results.
y_svm_pred = svm_classifier.predict(X_test)
svm_confusion = confusion_matrix(y_test, y_svm_pred)
svm_report = classification_report(y_test, y_svm_pred)
print("confusion_matrix:\n {}".format(svm_confusion))
print("\nclassification_report: \n {}".format(svm_report))
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Random forest with default hyper-parameters. The forest is built with
# random bootstrapping and feature sampling, so the seed is fixed (matching
# the seed used for the train/test split) to make the reported metrics
# reproducible from run to run.
Rf_classifier = RandomForestClassifier(random_state=0)
Rf_classifier.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_rf_pred = Rf_classifier.predict(X_test)
print("confusion_matrix:\n {}".format(confusion_matrix(y_test, y_rf_pred)))
print("\nclassification_report: \n {}".format(classification_report(y_test, y_rf_pred)))
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

# Initialising the ANN: a plain feed-forward stack of fully connected layers.
classifier = Sequential()

# Adding the input layer and the first hidden layer.
# The input width is taken from the training matrix instead of being
# hard-coded, so the network keeps working if the encoding step ever
# produces a different number of feature columns.
n_features = X_train.shape[1]
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=n_features))

# Adding the second hidden layer
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))

# Adding the output layer: one sigmoid unit for the binary target.
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

# Compiling the ANN
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fitting the ANN to the Training set
classifier.fit(X_train, y_train, batch_size=10, epochs=100)
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the test set and threshold the network's outputs at 0.5 to
# obtain boolean class predictions.
y_pred = classifier.predict(X_test) > 0.5

# Confusion matrix and per-class metrics on the held-out test set.
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))
from sklearn.decomposition import PCA

# Project the scaled training features onto their first two principal
# components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)
# Bare expression: only displayed when run interactively.
X_pca

# Tabulate the two components next to the training labels.
# NOTE(review): the column is called "pred" but holds y_train (the true
# training labels), not model predictions.
pca_df = pd.DataFrame(X_pca, columns=["pca 1", "pca 2"]).assign(pred=y_train)
It is a close competition, but the Support Vector Machine wins with precision = 0.86, recall = 0.86, and F1-score = 0.85.