“数据分析案例:借贷平台”的版本间的差异

来自CloudWiki
跳转至: 导航搜索
数据分析
数据分析
第1行: 第1行:
  
 
== 数据分析==
 
== 数据分析==
 +
== 建表语句 ==
 +
建库
  
[[https://imgchr.com/i/BXrC6J]]
+
'''create database 库名;'''
**2.上传数据**
 
  
创建指定文件夹,将数据上传至hdfs指定目录/college/下
+
'''建表,Prosper借贷平台 共81个字段'''
  
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
+
create table Prosper(
hadoop fs -mkdir -p /college
 
hadoop fs -put /root/college/bike.csv /college/
 
hadoop fs -ls /college/
 
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
 
  
3.建库建表
+
ListingKey string ,
  
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
+
ListingNumber int,
create database hive;
 
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
 
  
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
+
ListingCreationDate TIMESTAMP,
create table bike(
 
duration int,
 
startdate timestamp,
 
enddate timestamp,
 
startnum int,
 
startstation string,
 
endnum int,
 
endstation string,
 
bikenum string,
 
type string)
 
row format delimited fields terminated by ',';
 
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
 
  
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
+
CreditGrade string,
create table bike(
 
tid int ,
 
author string,
 
score string,
 
grade string,
 
title string,
 
content string)
 
row format delimited fields terminated by ',';
 
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
 
  
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
+
Term int,
create table bike(
 
duration int,
 
startdate string,
 
enddate string,
 
startnum int,
 
startstation string,
 
endnum string,
 
endstation string,
 
bikenum timestamp,
 
type timestamp,
 
      string,
 
      string,
 
      timestamp,
 
      timestamp,
 
      string)
 
row format delimited fields terminated by ' ';
 
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
 
  
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
+
LoanStatus string,
create table train_format(
 
user_id int,
 
age_range int,
 
gender int,
 
merchant_id int,
 
label int,
 
activity_log string)
 
row format delimited fields terminated by ',';
 
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
 
  
**4.数据导入**
+
ClosedDate TIMESTAMP,
  
本地数据导入    local意思是本地数据“/root/college/bike.csv”
+
BorrowerAPR float,
  
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
+
BorrowerRate float,
LOAD DATA LOCAL INPATH '数据的路径' INTO TABLE 表名;
 
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
 
  
统计表数据下入本地目录中,local意思是本地数据“
+
LenderYield float,
  
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
+
EstimatedEffectiveYield float,
INSERT OVERWRITE LOCAL DIRECTORY '路径’
 
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t’
 
SELECT 字段1, 字段2, 字段3 FROM 表名 ;
 
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
 
  
![1595473261254](C:\Users\DELL\AppData\Roaming\Typora\typora-user-images\1595473261254.png)
+
EstimatedLoss float,
  
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
+
EstimatedReturn float,
INSERT OVERWRITE LOCAL DIRECTORY'/root/college001/01/'
 
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
 
SELECT count(distinct bikenum) FROM bike;
 
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
 
  
![1595473333500](C:\Users\DELL\AppData\Roaming\Typora\typora-user-images\1595473333500.png)
+
ProsperRating(numeric) int,
  
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
+
ProsperRating (Alpha) string,
INSERT OVERWRITE LOCAL DIRECTORY 'root/college001/'
 
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
 
SELECT count(distinct bikenum) FROM bike;
 
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
 
  
![1594389021724](C:\Users\DELL\AppData\Roaming\Typora\typora-user-images\1594389021724.png)
+
ProsperScore int,
  
用join 或者左右,连接
+
ListingCategory (numeric) int,
  
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
+
BorrowerState string,
select startstation,count(*) as total from bike
 
where endstation = ''
 
group by total
 
order by total desc limit 1;
 
[[特殊:用户贡献/58.56.20.138|58.56.20.138]]
 
  
###
+
Occupation string,
 +
 
 +
EmploymentStatus string,
 +
 
 +
EmploymentStatusDuration int,
 +
 
 +
IsBorrowerHomeowner string,
 +
 
 +
CurrentlyInGroup string,
 +
 
 +
GroupKey string,
 +
 
 +
DateCreditPulled TIMESTAMP,
 +
 
 +
CreditScoreRangeLower int,
 +
 
 +
CreditScoreRangeUpper int,
 +
 
 +
FirstRecordedCreditLine TIMESTAMP,
 +
 
 +
CurrentCreditLines int,
 +
 
 +
OpenCreditLines int,
 +
 
 +
TotalCreditLinespast7years int,
 +
 
 +
OpenRevolvingAccounts int,
 +
 
 +
OpenRevolvingMonthlyPayment int,
 +
 
 +
InquiriesLast6Months int,
 +
 
 +
TotalInquiries int,
 +
 
 +
CurrentDelinquencies int,
 +
 
 +
AmountDelinquent int,
 +
 
 +
DelinquenciesLast7Years int,
 +
 
 +
PublicRecordsLast10Years int,
 +
 
 +
PublicRecordsLast12Months int,
 +
 
 +
RevolvingCreditBalance int,
 +
 
 +
BankcardUtilization float,
 +
 
 +
AvailableBankcardCredit int,
 +
 
 +
TotalTrades int,
 +
 
 +
TradesNeverDelinquent (percentage) float,
 +
 
 +
TradesOpenedLast6Months int,
 +
 
 +
DebtToIncomeRatio float,
 +
 
 +
IncomeRange string,
 +
 
 +
IncomeVerifiable string,
 +
 
 +
StatedMonthlyIncome double,
 +
 
 +
LoanKey string,
 +
 
 +
TotalProsperLoans int,
 +
 
 +
TotalProsperPaymentsBilled int,
 +
 
 +
OnTimeProsperPayments int,
 +
 
 +
ProsperPaymentsLessThanOneMonthLate int,
 +
 
 +
ProsperPaymentsOneMonthPlusLate int,
 +
 
 +
ProsperPrincipalBorrowed int,
 +
 
 +
ProsperPrincipalOutstanding double,
 +
 
 +
ScorexChangeAtTimeOfListing int,
 +
 
 +
LoanCurrentDaysDelinquent int,
 +
 
 +
LoanFirstDefaultedCycleNumber int,
 +
 
 +
LoanMonthsSinceOrigination int,
 +
 
 +
LoanNumber int,
 +
 
 +
LoanOriginalAmount int,
 +
 
 +
LoanOriginationDate TIMESTAMP,
 +
 
 +
LoanOriginationQuarter string,
 +
 
 +
MemberKey string,
 +
 
 +
MonthlyLoanPayment float,
 +
 
 +
LP_CustomerPayments double,
 +
 
 +
LP_CustomerPrincipalPayments double,
 +
 
 +
LP_InterestandFees double,
 +
LP_ServiceFees double,
 +
 
 +
LP_CollectionFees double,
 +
 
 +
LP_GrossPrincipalLoss double,
 +
 
 +
LP_NetPrincipalLoss double,
 +
 
 +
LP_NonPrincipalRecoverypayments double,
 +
 
 +
PercentFunded double,
 +
 
 +
Recommendations int,
 +
 
 +
InvestmentFromFriendsCount int,
 +
 
 +
InvestmentFromFriendsAmount double,
 +
 
 +
Investors int
 +
 
 +
)row format delimited
 +
 
 +
fields terminated by ',';
 +
 
 +
'''提取有用数据建表'''
 +
 
 +
create table test(
 +
 
 +
LoanStatus string,
 +
 
 +
BorrowerRate float,
 +
 
 +
ProsperScore int,
 +
 
 +
Occupation string,
 +
 
 +
EmploymentStatus string,
 +
 
 +
IsBorrowerHomeowner string,
 +
 
 +
CreditScoreRangeLower int,
 +
 
 +
CreditScoreRangeUpper int,
 +
 
 +
IncomeRange string
 +
 
 +
)row format delimited
 +
 
 +
fields terminated by ',';
 +
==数据清洗==
 +
导入另一个表
 +
 
 +
insert into table test select LoanStatus,BorrowerRate,ProsperScore,Occupation,EmploymentStatus,IsBorrowerHomeowner,CreditScoreRangeLower,CreditScoreRangeUpper,IncomeRange form perop;
 +
 
 +
导入后的数据示例
 +
 
 +
Completed,0.158,,Other,Self-employed,TRUE,640,659,"$25,000-49,999"
 +
 
 +
Current,0.092,7,Professional,Employed,FALSE,680,699,"$50,000-74,999"
 +
 
 +
Completed,0.275,,Other,Not available,FALSE,480,499,Not displayed
 +
 
 +
Current,0.0974,9,Skilled Labor,Employed,TRUE,800,819,"$25,000-49,999"
 +
 
 +
Current,0.2085,4,Executive,Employed,TRUE,680,699,"$100,000+"
 +
 
 +
Current,0.1314,10,Professional,Employed,TRUE,740,759,"$100,000+"
 +
 
 +
Current,0.2712,2,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999"
 +
 
 +
Current,0.2019,4,Laborer,Employed,FALSE,700,719,"$25,000-49,999"
 +
 
 +
Current,0.0629,9,Food Service,Employed,TRUE,820,839,"$25,000-49,999"
 +
 
 +
Current,0.0629,11,Food Service,Employed,TRUE,820,839,"$25,000-49,999"
 +
 
 +
Current,0.2489,7,Fireman,Employed,FALSE,640,659,"$75,000-99,999"
 +
 
 +
Completed,0.1325,,Waiter/Waitress,Full-time,FALSE,640,659,"$1-24,999"
 +
 
 +
Past Due (1-15 days),0.1435,4,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999"
 +
 
 +
Current,0.1034,8,Construction,Employed,TRUE,740,759,"$50,000-74,999"
 +
 
 +
Current,0.0949,8,Computer Programmer,Employed,FALSE,740,759,"$100,000+"
 +
 
 +
Defaulted,0.3177,5,Other,Other,TRUE,700,719,"$50,000-74,999"
 +
 
 +
Current,0.2809,4,Professional,Full-time,TRUE,640,659,"$75,000-99,999"
 +
 
 +
Chargedoff,0.125,,Professional,Not available,TRUE,760,779,Not displayed
 +
 
 +
Current,0.0974,7,Sales - Commission,Employed,FALSE,740,759,"$75,000-99,999"
 +
 
 +
Current,0.2225,8,Laborer,Employed,FALSE,680,699,"$25,000-49,999"
 +
 
 +
Current,0.1314,7,Retail Management,Employed,TRUE,660,679,"$50,000-74,999"
 +
 
 +
Completed,0.2075,,Professional,Full-time,FALSE,620,639,"$25,000-49,999"

2020年11月17日 (二) 00:51的版本

数据分析

建表语句

建库

create database 库名;

建表,Prosper借贷平台 共81个字段

create table Prosper(

ListingKey string ,

ListingNumber int,

ListingCreationDate TIMESTAMP,

CreditGrade string,

Term int,

LoanStatus string,

ClosedDate TIMESTAMP,

BorrowerAPR float,

BorrowerRate float,

LenderYield float,

EstimatedEffectiveYield float,

EstimatedLoss float,

EstimatedReturn float,

ProsperRating(numeric) int,

ProsperRating (Alpha) string,

ProsperScore int,

ListingCategory (numeric) int,

BorrowerState string,

Occupation string,

EmploymentStatus string,

EmploymentStatusDuration int,

IsBorrowerHomeowner string,

CurrentlyInGroup string,

GroupKey string,

DateCreditPulled TIMESTAMP,

CreditScoreRangeLower int,

CreditScoreRangeUpper int,

FirstRecordedCreditLine TIMESTAMP,

CurrentCreditLines int,

OpenCreditLines int,

TotalCreditLinespast7years int,

OpenRevolvingAccounts int,

OpenRevolvingMonthlyPayment int,

InquiriesLast6Months int,

TotalInquiries int,

CurrentDelinquencies int,

AmountDelinquent int,

DelinquenciesLast7Years int,

PublicRecordsLast10Years int,

PublicRecordsLast12Months int,

RevolvingCreditBalance int,

BankcardUtilization float,

AvailableBankcardCredit int,

TotalTrades int,

TradesNeverDelinquent (percentage) float,

TradesOpenedLast6Months int,

DebtToIncomeRatio float,

IncomeRange string,

IncomeVerifiable string,

StatedMonthlyIncome double,

LoanKey string,

TotalProsperLoans int,

TotalProsperPaymentsBilled int,

OnTimeProsperPayments int,

ProsperPaymentsLessThanOneMonthLate int,

ProsperPaymentsOneMonthPlusLate int,

ProsperPrincipalBorrowed int,

ProsperPrincipalOutstanding double,

ScorexChangeAtTimeOfListing int,

LoanCurrentDaysDelinquent int,

LoanFirstDefaultedCycleNumber int,

LoanMonthsSinceOrigination int,

LoanNumber int,

LoanOriginalAmount int,

LoanOriginationDate TIMESTAMP,

LoanOriginationQuarter string,

MemberKey string,

MonthlyLoanPayment float,

LP_CustomerPayments double,

LP_CustomerPrincipalPayments double,

LP_InterestandFees double, LP_ServiceFees double,

LP_CollectionFees double,

LP_GrossPrincipalLoss double,

LP_NetPrincipalLoss double,

LP_NonPrincipalRecoverypayments double,

PercentFunded double,

Recommendations int,

InvestmentFromFriendsCount int,

InvestmentFromFriendsAmount double,

Investors int

)row format delimited

fields terminated by ',';

提取有用数据建表

create table test(

LoanStatus string,

BorrowerRate float,

ProsperScore int,

Occupation string,

EmploymentStatus string,

IsBorrowerHomeowner string,

CreditScoreRangeLower int,

CreditScoreRangeUpper int,

IncomeRange string

)row format delimited

fields terminated by ',';

数据清洗

导入另一个表

insert into table test select LoanStatus,BorrowerRate,ProsperScore,Occupation,EmploymentStatus,IsBorrowerHomeowner,CreditScoreRangeLower,CreditScoreRangeUpper,IncomeRange form perop;

导入后的数据示例

Completed,0.158,,Other,Self-employed,TRUE,640,659,"$25,000-49,999"

Current,0.092,7,Professional,Employed,FALSE,680,699,"$50,000-74,999"

Completed,0.275,,Other,Not available,FALSE,480,499,Not displayed

Current,0.0974,9,Skilled Labor,Employed,TRUE,800,819,"$25,000-49,999"

Current,0.2085,4,Executive,Employed,TRUE,680,699,"$100,000+"

Current,0.1314,10,Professional,Employed,TRUE,740,759,"$100,000+"

Current,0.2712,2,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999"

Current,0.2019,4,Laborer,Employed,FALSE,700,719,"$25,000-49,999"

Current,0.0629,9,Food Service,Employed,TRUE,820,839,"$25,000-49,999"

Current,0.0629,11,Food Service,Employed,TRUE,820,839,"$25,000-49,999"

Current,0.2489,7,Fireman,Employed,FALSE,640,659,"$75,000-99,999"

Completed,0.1325,,Waiter/Waitress,Full-time,FALSE,640,659,"$1-24,999"

Past Due (1-15 days),0.1435,4,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999"

Current,0.1034,8,Construction,Employed,TRUE,740,759,"$50,000-74,999"

Current,0.0949,8,Computer Programmer,Employed,FALSE,740,759,"$100,000+"

Defaulted,0.3177,5,Other,Other,TRUE,700,719,"$50,000-74,999"

Current,0.2809,4,Professional,Full-time,TRUE,640,659,"$75,000-99,999"

Chargedoff,0.125,,Professional,Not available,TRUE,760,779,Not displayed

Current,0.0974,7,Sales - Commission,Employed,FALSE,740,759,"$75,000-99,999"

Current,0.2225,8,Laborer,Employed,FALSE,680,699,"$25,000-49,999"

Current,0.1314,7,Retail Management,Employed,TRUE,660,679,"$50,000-74,999"

Completed,0.2075,,Professional,Full-time,FALSE,620,639,"$25,000-49,999"