Python数据分析:实用向( 四 )

# --- Kernel density estimate (KDE) plot ---
# The original used `plt` and `sns` before any import had run, so the plotting
# dependencies are imported explicitly here.
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): `data` is not defined in this section; presumably it is the
# DataFrame loaded earlier, with `client` and `cvr_group_high` columns - confirm.
fig = plt.figure(figsize=(15, 8))
# Two equivalent ways of selecting one group's `client` values are shown below;
# use whichever reads better.
# NOTE(review): newer seaborn releases rename `shade` to `fill`; switch the
# keyword if this call warns or fails on your installed version.
ax = sns.kdeplot(
    data.client[data.cvr_group_high == 0],
    color='gray',
    shade=True,
    # Both curves were labelled 'high' originally, which made the legend
    # useless; group 0 is the not-high group.
    label='low',
)
ax = sns.kdeplot(
    data.loc[(data['cvr_group_high'] == 1), 'client'],
    color='g',
    shade=True,
    label='high',
)
plt.title('client - low vs high', fontsize=25, pad=40)
plt.ylabel("Frequency of cvr", fontsize=15, labelpad=20)
plt.xlabel("Client", fontsize=15, labelpad=20)
# Replace the numeric client codes on the x axis with readable names.
# NOTE(review): assumes `client` has exactly five distinct codes whose sorted
# order matches this list - verify against the encoding step.
labels = ['H5', 'android', 'ios', 'pc', 'wap']
plt.xticks(sorted(data.client.unique()), labels)
plt.legend()


# --- Model training: imports ---
import math
import warnings

from pyforest import *  # noqa: F401,F403 - kept from the original article
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.cluster import KMeans  # K-Means clustering
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier  # random forest
from sklearn.ensemble import VotingClassifier  # voting ensemble
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.metrics import (  # classification metrics
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.metrics import confusion_matrix  # confusion matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import silhouette_score  # silhouette score (k-means quality)
from sklearn.model_selection import GridSearchCV  # grid search with cross-validation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB  # naive Bayes
from sklearn.neighbors import KNeighborsClassifier  # k-nearest neighbours
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC  # support vector machines
from sklearn.tree import DecisionTreeClassifier  # decision tree
from xgboost import XGBClassifier

# Silence all warnings so the training output stays readable.
warnings.filterwarnings("ignore")


def plot_predictions(test, predicted):
    """Plot true values and predictions against a 1-based sample index.

    Args:
        test: Sequence of true target values.
        predicted: Sequence of predicted values, same length as *test*.

    Draws on the current matplotlib figure; the caller is responsible for
    calling ``plt.show()``.
    """
    # Shift the x axis so the first sample is plotted at 1 rather than 0.
    x = np.arange(0, len(test)) + 1
    plt.plot(x, test, label='Real')
    plt.plot(x, predicted, color='darkOrange', linestyle='--', label='Predicted')
    plt.ylabel('count')
    plt.legend()


def mse_loss(y_true, y_pred):
    """Return HALF the mean squared error between *y_true* and *y_pred*.

    Note the extra division by 2: the result is ``sum((y_true - y_pred)**2)
    / n / 2``, not the plain MSE. *y_true* must expose ``.shape``.
    """
    return np.sum(np.power(y_true - y_pred, 2)) / y_true.shape[0] / 2


def return_rmse(test, predicted):
    """Return the root mean squared error between *test* and *predicted*.

    The original implementation took the square root of ``mse_loss()``, i.e.
    of half the MSE, which under-reports the RMSE by a factor of sqrt(2).
    The error is computed here directly on plain float arrays, which also
    avoids pandas index alignment between the two inputs.
    """
    errors = np.asarray(test, dtype=float) - np.asarray(predicted, dtype=float)
    return math.sqrt(np.mean(np.square(errors)))


# Candidate models as [display name, estimator] pairs, all with default
# hyper-parameters except KNN's explicit n_neighbors.
Classifiers = [
    ["Random Forest", RandomForestClassifier()],
    ["Support Vector Machine", SVC()],
    ["LogisticRegression", LogisticRegression()],
    ["KNN", KNeighborsClassifier(n_neighbors=5)],
    ["Naive Bayes", GaussianNB()],
    ["Decision Tree", DecisionTreeClassifier()],
    ["AdaBoostClassifier", AdaBoostClassifier()],
    ["GradientBoostingClassifier", GradientBoostingClassifier()],
    ["XGB", XGBClassifier()],
]


# --- Build the training set ---
# NOTE(review): `train` is not defined in this section; presumably it is the
# prepared DataFrame from an earlier step - confirm.
# Dropped columns: customer id, brand type, and the target (purchase intention).
X = train.drop(['目标客户编号', '品牌类型', '购买意愿'], axis=1)
t = X  # reference to the feature frame before the float cast; unused below
headers = X.columns  # feature names; unused below
X = X.astype(float)
y = train["购买意愿"]

# The original fitted on X_train/y_train and scored on X_test/y_test without
# ever deriving them from X and y, so a hold-out split is added here.
# NOTE(review): test_size and random_state are assumptions - adjust to match
# the split the article's numbers were produced with.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
)


# --- Train and score every model ---
Classify_result = []  # one single-column DataFrame of metrics per model
names = []            # model display names, in training order
prediction = []       # test-set predictions per model, as Series
for name, classifier in Classifiers:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # Macro averaging weights every class equally regardless of its support.
    recall = recall_score(y_test, y_pred, average='macro')
    precision = precision_score(y_test, y_pred, average='macro')
    f1score = f1_score(y_test, y_pred, average='macro')
    rmse = return_rmse(y_test, y_pred)

    Classify_result.append(pd.DataFrame([recall, precision, f1score, rmse]))
    names.append(name)
    y_pred = pd.Series(y_pred)
    prediction.append(y_pred)

    plot_predictions(y_test, y_pred)
    plt.title(name)  # identify which model each figure belongs to
    plt.show()


# --- Model evaluation ---
# Assemble one table: rows are metrics, columns are models.
result = pd.concat(Classify_result, axis=1)
result.columns = names
result.index = ["recall", "precision", "f1score", "rmse"]
print(result)


# --- Handy utilities: showing a progress bar with tqdm ---
from tqdm import tqdm

for i in tqdm(range(100)):
    pass

记录时间
import time

# Measure how long a piece of code takes by reading the clock before and
# after it runs.
time_begin = time.time()
# ... put the code you want to time here ...
time_end = time.time()
# Keep the duration under its own name: the original assigned it to `time`,
# which replaced the imported module and broke any later `time.time()` call.
elapsed = time_end - time_begin
print('time:', elapsed)

经验总结

扩展阅读