文件处理
导包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
添加镜像
https://mirrors.tuna.tsinghua.edu.cn/
https://developer.aliyun.com/mirror/
http://mirrors.163.com/ubuntu/
https://mirrors.ustc.edu.cn/
http://mirrors.zju.edu.cn/
http://mirrors.sohu.com/
http://ftp.sjtu.edu.cn/
http://mirror.bjtu.edu.cn/
http://mirror.bjtu.edu.cn/
语法
其中 http 和 https 是可选的。
! pip install xxx -i https://mirrors.tuna.tsinghua.edu.cn/
# Loading data
# Excel: read the sheet named "data" and preview the first rows.
data = pd.read_excel(r"C:\Users\ranxi\Desktop\附录1 目标客户体验数据.xlsx", sheet_name='data')
data.head()
# CSV: pd.read_csv requires the file path as its first argument; the bare
# pd.read_csv() call raises TypeError, so it is kept here as a template.
# data = pd.read_csv(r"path\to\file.csv")
# EDA report
# Generate the report.
import pandas_profiling

data.profile_report()
# Write the report to a file.
pfr = pandas_profiling.ProfileReport(data)
pfr.to_file('report.html')
# Export the DataFrame to an Excel file.
data.to_excel('data.xlsx')
# Data processing
# Data filtering
# Per-group means: group the rows by "cvr_group_high", then average each group.
cvr_summary = data.groupby("cvr_group_high")
cvr_summary.mean().reset_index()
# Label encoding
from sklearn.preprocessing import LabelEncoder

# Show the distinct values before and after encoding to make the mapping visible.
print("client", "--", data.client.unique())
data.client = LabelEncoder().fit_transform(data.client)
print("client", "--", data.client.unique())
# Cross-tabulation of proportions
# normalize="index" (the explicit spelling of normalize=0) divides each row by
# its row total.
pd.crosstab(data['invited_is'], data["cvr_group_high"], normalize="index")
# Distribution percentages
def percent_value_counts(df, feature):
    """Return the count and percentage share of each value in a column.

    Args:
        df: DataFrame to inspect.
        feature: Label of the column whose values are tallied.

    Returns:
        DataFrame indexed by the distinct values of the column (missing
        values included) with a "Total" column holding the occurrence count
        and a "Percent" column holding the share in percent, rounded to two
        decimals.
    """
    column = df.loc[:, feature]
    # dropna=False keeps missing values as a category of their own.
    percent = pd.DataFrame(
        round(column.value_counts(dropna=False, normalize=True) * 100, 2)
    )
    total = pd.DataFrame(column.value_counts(dropna=False))
    # Name the two single-column frames, then join them side by side.
    total.columns = ["Total"]
    percent.columns = ["Percent"]
    return pd.concat([total, percent], axis=1)


percent_value_counts(data, "B7")
# apply over multiple columns
# axis=1 calls the lambda once per row; each row's B6 and B5 values are passed
# to child_estimator and the result is stored in column B7.
with_N['B7'] = with_N.apply(lambda row: child_estimator(row['B6'], row['B5']), axis=1)
# Chi-square test
# Frequency comparisons are only trustworthy when the groups really differ
# significantly, so run a chi-square test first.
from scipy.stats import chi2_contingency


# Statistical analysis: custom chi-square test helper.
def KF(x):
    """Run a chi-square test of '购买意愿' against column ``x`` of ``data2``.

    Builds the crosstab of data2['购买意愿'] by data2[x], feeds its first two
    rows to ``chi2_contingency`` and prints whether the result is below 0.05.

    Args:
        x: Label of the ``data2`` column to test.

    Returns:
        None. The verdict is printed.
    """
    df1 = pd.crosstab(data2['购买意愿'], data2[x])
    # Only the first two rows of the crosstab are tested, as in the original.
    kf_data = df1.iloc[[0, 1]].to_numpy()
    # Element 1 of the chi2_contingency result is the p-value (the printed
    # message calls it the "critical value"; the text is left unchanged).
    p_value = chi2_contingency(kf_data)[1]
    if p_value < 0.05:
        print('购买意愿 by {} 的卡方临界值是{:.2f} , 小于0.05 , 表明{}组间有显著性差异,可进行【交叉分析】'.format(x, p_value, x), '\n')
    else:
        print('购买意愿 by {} 的卡方临界值是{:.2f} , 大于0.05 , 表明{}组间无显著性差异,不可进行交叉分析'.format(x, p_value, x), '\n')


# Run the chi-square test on every variable in kf_var.
print('kf_var的卡方检验结果如下:', '\n')
# KF prints its verdict and returns None, so a plain loop is used instead of
# print(list(map(KF, kf_var))), which also printed a list of None values.
for var in kf_var:
    KF(var)
# Conditional filtering
# Keep the rows where at least one of the columns a1..a8 is greater than 100.
threshold_columns = ['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8']
specific = data[(data[threshold_columns] > 100).any(axis=1)]
specific
# General pattern: wrap each condition in parentheses and combine them with
# | (or) or & (and):
#     specific = data[(data['col'] > x) | (...)]
#     specific = data[(data['col'] > x) & (...)]
# Single equality condition:
data[data.Cabin == 'N']
# Grouping values with map
def hour_group_fun(hour):
    """Map an hour value to a group number.

    Args:
        hour: Numeric hour value.

    Returns:
        1 when 0 <= hour < 8, 2 when 8 <= hour < 16, and 3 for every other
        value.
    """
    if 0 <= hour < 8:
        return 1
    if 8 <= hour < 16:
        return 2
    return 3


# Apply the function to the column.
police['hour_group'] = police['hour'].map(hour_group_fun)
# Assigning a column from a multi-column apply
# axis=1 calls the lambda once per row; each row's B6 and B5 values are passed
# to child_estimator and the result is stored in column B7.
with_N['B7'] = with_N.apply(lambda row: child_estimator(row['B6'], row['B5']), axis=1)
# Distribution-percentage helper
def percent_value_counts(df, feature):
    """Return the count and percentage share of each value in a column.

    Args:
        df: DataFrame to inspect.
        feature: Label of the column whose values are tallied.

    Returns:
        DataFrame indexed by the distinct values of the column (missing
        values included) with a "Total" column holding the occurrence count
        and a "Percent" column holding the share in percent, rounded to two
        decimals.
    """
    column = df.loc[:, feature]
    # dropna=False keeps missing values as a category of their own.
    percent = pd.DataFrame(
        round(column.value_counts(dropna=False, normalize=True) * 100, 2)
    )
    total = pd.DataFrame(column.value_counts(dropna=False))
    # Name the two single-column frames, then join them side by side.
    total.columns = ["Total"]
    percent.columns = ["Percent"]
    return pd.concat([total, percent], axis=1)
经验总结
扩展阅读
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 给父母买什么礼物最实用
- 微波炉和烤箱哪个更实用
- python ROS2时间同步
- 图文 Python 嵌入式打包
- 教师节礼物送什么实用
- 送大学生什么礼物好实用
- 媳妇生日送什么礼物好实用的
- 其三 Gitea 1.18 功能前瞻:增强文本预览效果、继续扩展软件包注册中心、增强工单实用功能、完善了用户邀请机制和SEO
- 3 Python全栈工程师之从网页搭建入门到Flask全栈项目实战 - 入门Flask微框架
- 跟我学Python图像处理丨图像特效处理:毛玻璃、浮雕和油漆特效