# Importing the libraries
import datetime
import sys
import warnings

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import colors
from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d import Axes3D
from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from yellowbrick.cluster import KElbowVisualizer

# Silence warnings unless the interpreter was started with explicit -W options.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# Seed NumPy's global random state.
np.random.seed(42)

# Loading the dataset (tab-separated file)
data = pd.read_csv("1_marketing_campaign.csv", sep="\t")
print("Number of datapoints:", len(data))
data.head()

# Information on features
data.info()

# Remove every row that has at least one missing value
data = data.dropna()
print("The total number of data-points after removing the rows with missing values are:", len(data))

# Parse the enrolment date (day comes first) and strip any time-of-day part,
# so that differences below are whole calendar days.
data["Dt_Customer"] = pd.to_datetime(data["Dt_Customer"], dayfirst=True)
dates = data["Dt_Customer"].dt.normalize()

# Dates of the newest and oldest recorded customer
print("The newest customer's enrolment date in the records:", dates.max().date())
print("The oldest customer's enrolment date in the records:", dates.min().date())

# Created a feature "Customer_For": number of whole days between each
# customer's enrolment and the newest enrolment in the records.
# Stored explicitly as integer days instead of the raw integer representation
# of a timedelta, so the unit is unambiguous.
d1 = dates.max()  # taking it to be the newest customer
data["Customer_For"] = (d1 - dates).dt.days

print("Total categories in the feature Marital_Status:\n", data["Marital_Status"].value_counts(), "\n")
print("Total categories in the feature Education:\n", data["Education"].value_counts())

# Feature Engineering
# Age of customer, computed against the fixed reference year 2021
data["Age"] = 2021 - data["Year_Birth"]
# Total spendings on various items
data["Spent"] = (
    data["MntWines"]
    + data["MntFruits"]
    + data["MntMeatProducts"]
    + data["MntFishProducts"]
    + data["MntSweetProducts"]
    + data["MntGoldProds"]
)

# Deriving living situation ("Partner" or "Alone") from marital status
data["Living_With"] = data["Marital_Status"].replace(
    {
        "Married": "Partner",
        "Together": "Partner",
        "Absurd": "Alone",
        "Widow": "Alone",
        "YOLO": "Alone",
        "Divorced": "Alone",
        "Single": "Alone",
    }
)

# Feature indicating total children living in the household
data["Children"] = data["Kidhome"] + data["Teenhome"]

# Feature for total members in the household: adults (1 or 2) plus children.
# map() returns a numeric column directly, so the sum does not rely on
# replace() implicitly converting an object column to integers.
# NOTE: a Living_With value other than "Alone"/"Partner" becomes NaN here.
data["Family_Size"] = data["Living_With"].map({"Alone": 1, "Partner": 2}) + data["Children"]

# Feature pertaining parenthood (1 if there is at least one child, else 0)
data["Is_Parent"] = np.where(data.Children > 0, 1, 0)

# Segmenting education levels in three groups
data["Education"] = data["Education"].replace(
    {
        "Basic": "Undergraduate",
        "2n Cycle": "Undergraduate",
        "Graduation": "Graduate",
        "Master": "Postgraduate",
        "PhD": "Postgraduate",
    }
)

# For clarity: shorter names for the spending columns
data = data.rename(
    columns={
        "MntWines": "Wines",
        "MntFruits": "Fruits",
        "MntMeatProducts": "Meat",
        "MntFishProducts": "Fish",
        "MntSweetProducts": "Sweets",
        "MntGoldProds": "Gold",
    }
)
# Dropping some of the redundant features
to_drop = ["Marital_Status", "Dt_Customer", "Z_CostContact", "Z_Revenue", "Year_Birth", "ID"]
data = data.drop(to_drop, axis=1)
data.describe()

# To plot some selected features
# Setting up colors preferences
sns.set(rc={"axes.facecolor": "#FFF9ED", "figure.facecolor": "#FFF9ED"})
pallet = ["#682F2F", "#9E726F", "#D6B2B1", "#B9C0C9", "#9F8A78", "#F3AB60"]
cmap = colors.ListedColormap(["#682F2F", "#9E726F", "#D6B2B1", "#B9C0C9", "#9F8A78", "#F3AB60"])

# Plotting following features
To_Plot = ["Income", "Recency", "Customer_For", "Age", "Spent", "Is_Parent"]
print("Reletive Plot Of Some Selected Features: A Data Subset")
# pairplot is a figure-level function that builds its own figure, so no
# plt.figure() call is made first (it would only leave a blank extra figure).
sns.pairplot(data[To_Plot], hue="Is_Parent", palette=["#682F2F", "#F3AB60"])
plt.show()

# Dropping the outliers by setting a cap on Age and Income.
# This happens after the pair plot, so the plot above still shows them.
data = data[(data["Age"] < 90) & (data["Income"] < 600000)]
print("The total number of data-points after removing the outliers are:", len(data))

# Re-imports carried over from the original notebook cell; harmless when
# these modules are already imported.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Select the numeric columns (correlation is only defined for numbers)
numeric_data = data.select_dtypes(include=[float, int])

# Compute the correlation matrix
corrmat = numeric_data.corr()

# Draw the heatmap, with the colour scale centred on zero correlation
plt.figure(figsize=(20, 20))
sns.heatmap(corrmat, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix Heatmap')
plt.show()

# Get list of categorical variables (columns stored with object dtype)
s = (data.dtypes == 'object')
object_cols = list(s[s].index)
print("Categorical variables in the dataset:", object_cols)

# Label Encoding the object dtypes.
# The encoder is re-fitted on each column, so every column gets its own codes.
LE = LabelEncoder()
for i in object_cols:
    data[i] = LE.fit_transform(data[i])
print("All features are now numerical")

# Creating a copy of data
ds = data.copy()
# Creating a subset of dataframe by dropping the features on deals accepted and promotions
cols_del = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Response']
ds = ds.drop(cols_del, axis=1)

# Scaling
scaler = StandardScaler()
scaler.fit(ds)
scaled_ds = pd.DataFrame(scaler.transform(ds), columns=ds.columns)
print("All features are now scaled")

# Scaled data to be used for reducing the dimensionality
print("Dataframe to be used for further modelling:")
scaled_ds.head()

# Initiating PCA to reduce dimensions aka features to 3
pca = PCA(n_components=3)
pca.fit(scaled_ds)
PCA_ds = pd.DataFrame(pca.transform(scaled_ds), columns=(["col1", "col2", "col3"]))
PCA_ds.describe().T

# A 3D Projection Of Data In The Reduced Dimension
x = PCA_ds["col1"]
y = PCA_ds["col2"]
z = PCA_ds["col3"]
# To plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(x, y, z, c="maroon", marker="o")
ax.set_title("A 3D Projection Of Data In The Reduced Dimension")
plt.show()

# Quick examination of elbow method to find numbers of clusters to make.
print('Elbow Method to determine the number of clusters to be formed:')
Elbow_M = KElbowVisualizer(KMeans(), k=10)
Elbow_M.fit(PCA_ds)
Elbow_M.show()

# Initiating the Agglomerative Clustering model
# NOTE(review): n_clusters=4 is presumably read off the elbow plot above - confirm.
AC = AgglomerativeClustering(n_clusters=4)
# Fit model and predict clusters
yhat_AC = AC.fit_predict(PCA_ds)
PCA_ds["Clusters"] = yhat_AC
# Adding the Clusters feature to the original dataframe.
# yhat_AC is a plain array, so labels are assigned by row position.
data["Clusters"] = yhat_AC

# Plotting the clusters
fig = plt.figure(figsize=(10, 8))
ax = plt.subplot(111, projection='3d', label="bla")
ax.scatter(x, y, z, s=40, c=PCA_ds["Clusters"], marker='o', cmap=cmap)
ax.set_title("The Plot Of The Clusters")
plt.show()

# Plotting countplot of clusters
pal = ["#682F2F", "#B9C0C9", "#9F8A78", "#F3AB60"]
pl = sns.countplot(x=data["Clusters"], palette=pal)
pl.set_title("Distribution Of The Clusters")
plt.show()

# Income against total spending, coloured by cluster
pl = sns.scatterplot(data=data, x=data["Spent"], y=data["Income"], hue=data["Clusters"], palette=pal)
pl.set_title("Cluster's Profile Based On Income And Spending")
plt.legend()
plt.show()

# Spending distribution per cluster: swarm plot overlaid with a boxen plot
plt.figure()
pl = sns.swarmplot(x=data["Clusters"], y=data["Spent"], color="#CBEDDD", alpha=0.5)
pl = sns.boxenplot(x=data["Clusters"], y=data["Spent"], palette=pal)
plt.show()

# Creating a feature to get a sum of accepted promotions
data["Total_Promos"] = (
    data["AcceptedCmp1"]
    + data["AcceptedCmp2"]
    + data["AcceptedCmp3"]
    + data["AcceptedCmp4"]
    + data["AcceptedCmp5"]
)
# Plotting count of total campaign accepted.
plt.figure()
pl = sns.countplot(x=data["Total_Promos"], hue=data["Clusters"], palette=pal)
pl.set_title("Count Of Promotion Accepted")
pl.set_xlabel("Number Of Total Accepted Promotions")
plt.show()

# Plotting the number of deals purchased
plt.figure()
pl = sns.boxenplot(y=data["NumDealsPurchases"], x=data["Clusters"], palette=pal)
pl.set_title("Number of Deals Purchased")
plt.show()

# Personal attributes to profile against spending, one plot per attribute
Personal = ["Kidhome", "Teenhome", "Customer_For", "Age", "Children", "Family_Size", "Is_Parent", "Education", "Living_With"]
# One KDE joint plot of each personal attribute against total spending,
# coloured by cluster.
for i in Personal:
    # jointplot is a figure-level function that creates its own figure, so no
    # plt.figure() call is made first (it would add a blank figure per loop).
    sns.jointplot(x=data[i], y=data["Spent"], hue=data["Clusters"], kind="kde", palette=pal)
    plt.show()
# (End of notebook export; the trailing "Comments" page-footer text was not code.)