PySpark实战:描述统计

来自CloudWiki
跳转至: 导航搜索

介绍

统计分为描述统计和推论统计

所谓描述统计,是指通过图表或数学方法,对数据资料进行整理、分析,

并对数据的分布状态、字段数字特征和字段数值之间的关系进行估计和描述。

描述统计分为以下三大类:

  • 集中趋势分析:平均数、中位数、众数等,表示数据的集中趋势
  • 离中趋势分析:主要靠最大值和最小值距离、四分差、平均差、方差、标准差等统计指标来研究数据的离中趋势
  • 相关分析:数据之间是否具有统计学上的相关性

实训步骤

计算基本的统计信息

#import findspark
#findspark.init()
##############################################
from pyspark.sql import SparkSession
from pyspark.sql.context import SQLContext
# Build (or reuse) a local Spark session that runs on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("PySpark ML")
    .getOrCreate()
)
sc = spark.sparkContext
#############################################
# Load the Titanic training set: the first row is the header and column
# types are inferred. The DataFrame is cached because the following
# aggregations all read it again.
df_train = spark.read.csv(
    './data/titanic-train.csv',
    header=True,
    inferSchema=True,
).cache()
#df_test = spark.read.csv('./data/titanic-test.csv',header=True,inferSchema=True).cache()
# Print the basic summary statistics (count / mean / stddev / min / max),
# one table per group of columns.
for summary_columns in (
    ("Age", "Pclass", "SibSp", "Parch"),
    ("Sex", "Cabin", "Embarked", "Fare", "Survived"),
):
    df_train.describe(*summary_columns).show()


输出:

+-------+------------------+------------------+------------------+-------------------+
|summary|               Age|            Pclass|             SibSp|              Parch|
+-------+------------------+------------------+------------------+-------------------+
|  count|               714|               891|               891|                891|
|   mean| 29.69911764705882| 2.308641975308642|0.5230078563411896|0.38159371492704824|
| stddev|14.526497332334035|0.8360712409770491|1.1027434322934315| 0.8060572211299488|
|    min|              0.42|                 1|                 0|                  0|
|    max|              80.0|                 3|                 8|                  6|
+-------+------------------+------------------+------------------+-------------------+

+-------+------+-----+--------+-----------------+-------------------+
|summary|   Sex|Cabin|Embarked|             Fare|           Survived|
+-------+------+-----+--------+-----------------+-------------------+
|  count|   891|  204|     889|              891|                891|
|   mean|  null| null|    null| 32.2042079685746| 0.3838383838383838|
| stddev|  null| null|    null|49.69342859718089|0.48659245426485753|
|    min|female|  A10|       C|              0.0|                  0|
|    max|  male|    T|       S|         512.3292|                  1|
+-------+------+-----+--------+-----------------+-------------------+

性别与生存率的关系

# Count passengers for every (sex, Survived) combination.
sex_survival = df_train.groupBy('sex', 'Survived').agg({'PassengerId': 'count'})
# The dict-style aggregation names its result "count(PassengerId)";
# rename it to the shorter "count".
sex_survival = sex_survival.withColumnRenamed("count(PassengerId)", "count")
# Sort by sex and bring the small result to the driver as a pandas DataFrame.
pdf = sex_survival.orderBy("sex").toPandas()

print(pdf)

输出:

      sex  Survived  count
0  female         1    233
1  female         0     81
2    male         0    468
3    male         1    109

性别生存率可视化

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Use a font with Chinese glyphs so the Chinese title and axis labels are not
# rendered as garbled boxes, and keep the minus sign displayable with it.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
width = 0.35
fig, ax = plt.subplots()
labels = ["female", 'male']
print(pdf[pdf["Survived"] == 1])
print(pdf[pdf["Survived"] == 0])

# Split the counts by outcome and key them by sex. Reindexing to ``labels``
# pairs every bar with its category by name instead of by row position, and
# a sex with no rows for an outcome becomes 0 rather than breaking the plot.
survived_counts = (
    pdf[pdf["Survived"] == 1]
    .set_index("sex")["count"]
    .reindex(labels, fill_value=0)
)
died_counts = (
    pdf[pdf["Survived"] == 0]
    .set_index("sex")["count"]
    .reindex(labels, fill_value=0)
)
print()
print(survived_counts)  # survivors per sex
print(died_counts)  # non-survivors per sex

# Stacked bars: survivors at the bottom, non-survivors stacked on top.
ax.bar(labels, survived_counts.values, width, label='Survived')
ax.bar(labels, died_counts.values, width, bottom=survived_counts.values,
       label='UnSurvived')
# The x axis holds the sex categories; the y axis is the passenger count.
ax.set_xlabel('性别')
ax.set_ylabel('人数')
ax.set_title('Survived和性别关系分析')
ax.legend()
plt.show()

Python21073101.png

其他因素与生还率

# For each remaining factor, count passengers per (factor, Survived) pair
# and print the result as a pandas DataFrame. No ordering is applied, so the
# row order of each printed table is whatever Spark returns.
for factor in ('Pclass', 'Parch', 'SibSp'):
    pdf = (
        df_train.groupBy(factor, 'Survived')
        .agg({'PassengerId': 'count'})
        .withColumnRenamed("count(PassengerId)", "count")
        .toPandas()
    )
    print(pdf)
#############################################
# All queries are done; shut down the SparkContext.
sc.stop()

输出:

   Pclass  Survived  count
0       1         0     80
1       3         1    119
2       1         1    136
3       2         1     87
4       2         0     97
5       3         0    372
    Parch  Survived  count
0       1         0     53
1       3         1      3
2       6         0      1
3       4         0      4
4       1         1     65
5       0         0    445
6       2         1     40
7       0         1    233
8       2         0     40
9       5         0      4
10      5         1      1
11      3         0      2
    SibSp  Survived  count
0       1         0     97
1       3         1      4
2       4         0     15
3       1         1    112
4       0         0    398
5       2         1     13
6       0         1    210
7       2         0     15
8       5         0      5
9       8         0      7
10      3         0     12
11      4         1      3

完整代码


#import findspark
#findspark.init()
##############################################
from pyspark.sql import SparkSession
from pyspark.sql.context import SQLContext
# Build (or reuse) a local Spark session that runs on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("PySpark ML")
    .getOrCreate()
)
sc = spark.sparkContext
#############################################
# Load the Titanic training set: the first row is the header and column
# types are inferred. The DataFrame is cached because the following
# aggregations all read it again.
df_train = spark.read.csv(
    './data/titanic-train.csv',
    header=True,
    inferSchema=True,
).cache()
#df_test = spark.read.csv('./data/titanic-test.csv',header=True,inferSchema=True).cache()
# Print the basic summary statistics (count / mean / stddev / min / max),
# one table per group of columns.
for summary_columns in (
    ("Age", "Pclass", "SibSp", "Parch"),
    ("Sex", "Cabin", "Embarked", "Fare", "Survived"),
):
    df_train.describe(*summary_columns).show()

# Count passengers for every (sex, Survived) combination.
sex_survival = df_train.groupBy('sex', 'Survived').agg({'PassengerId': 'count'})
# The dict-style aggregation names its result "count(PassengerId)";
# rename it to the shorter "count".
sex_survival = sex_survival.withColumnRenamed("count(PassengerId)", "count")
# Sort by sex and bring the small result to the driver as a pandas DataFrame.
pdf = sex_survival.orderBy("sex").toPandas()
# Sample result from one run:
#       sex  Survived  count
# 0  female         1    233
# 1  female         0     81
# 2    male         0    468
# 3    male         1    109
print(pdf)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Use a font with Chinese glyphs so the Chinese title and axis labels are not
# rendered as garbled boxes, and keep the minus sign displayable with it.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
width = 0.35
fig, ax = plt.subplots()
labels = ["female", 'male']
print(pdf[pdf["Survived"] == 1])
print(pdf[pdf["Survived"] == 0])

# Split the counts by outcome and key them by sex. Reindexing to ``labels``
# pairs every bar with its category by name instead of by row position, and
# a sex with no rows for an outcome becomes 0 rather than breaking the plot.
survived_counts = (
    pdf[pdf["Survived"] == 1]
    .set_index("sex")["count"]
    .reindex(labels, fill_value=0)
)
died_counts = (
    pdf[pdf["Survived"] == 0]
    .set_index("sex")["count"]
    .reindex(labels, fill_value=0)
)
print()
print(survived_counts)  # survivors per sex
print(died_counts)  # non-survivors per sex

# Stacked bars: survivors at the bottom, non-survivors stacked on top.
ax.bar(labels, survived_counts.values, width, label='Survived')
ax.bar(labels, died_counts.values, width, bottom=survived_counts.values,
       label='UnSurvived')
# The x axis holds the sex categories; the y axis is the passenger count.
ax.set_xlabel('性别')
ax.set_ylabel('人数')
ax.set_title('Survived和性别关系分析')
ax.legend()
plt.show()

# For each remaining factor, count passengers per (factor, Survived) pair
# and print the result as a pandas DataFrame. No ordering is applied, so the
# row order of each printed table is whatever Spark returns.
for factor in ('Pclass', 'Parch', 'SibSp'):
    pdf = (
        df_train.groupBy(factor, 'Survived')
        .agg({'PassengerId': 'count'})
        .withColumnRenamed("count(PassengerId)", "count")
        .toPandas()
    )
    print(pdf)
#############################################
# All queries are done; shut down the SparkContext.
sc.stop()
#'''