“2020红亚杯:借贷数据分析”的版本间的差异

来自CloudWiki
跳转至: 导航搜索
数据分析
数据分析
 
(未显示2个用户的4个中间版本)
第3行: 第3行:
 
Prosper是美国一家p2p在在线借贷平台,网站可撮合一些有闲钱的人和基于用钱的热,用户若有贷款需求,可在网站上列出期望数额和可承受最大利率,潜在贷方则为数额和利率展开竞价,以下是该网站的部分数据
 
Prosper是美国一家p2p在在线借贷平台,网站可撮合一些有闲钱的人和基于用钱的热,用户若有贷款需求,可在网站上列出期望数额和可承受最大利率,潜在贷方则为数额和利率展开竞价,以下是该网站的部分数据
  
LoanStatus,BorrowerRate,ProsperScore,Occupation,EmploymentStatus,IsBorrowerHomeowner,CreditScoreRangeLower,CreditScoreRangeUpper,IncomeRange
+
<nowiki>LoanStatus,BorrowerRate,ProsperScore,Occupation,EmploymentStatus,IsBorrowerHomeowner,CreditScoreRangeLower,CreditScoreRangeUpper,IncomeRange
  
 
Completed,0.158,,Other,Self-employed,TRUE,640,659,"$25,000-49,999"
 
Completed,0.158,,Other,Self-employed,TRUE,640,659,"$25,000-49,999"
第19行: 第19行:
 
Current,0.2712,2,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999"
 
Current,0.2712,2,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999"
  
Current,0.2019,4,Laborer,Employed,FALSE,700,719,"$25,000-49,999"
+
Current,0.2019,4,Laborer,Employed,FALSE,700,719,"$25,000-49,999"</nowiki>
  
 
[[文件:12345646876545641.jpg|750px]]
 
[[文件:12345646876545641.jpg|750px]]
第27行: 第27行:
  
 
'''create database 库名;'''
 
'''create database 库名;'''
 
'''建表,Prosper借贷平台 共81个字段'''
 
 
create table Prosper(
 
 
ListingKey string ,
 
 
ListingNumber int,
 
 
ListingCreationDate TIMESTAMP,
 
 
CreditGrade string,
 
 
Term int,
 
 
LoanStatus string,
 
 
ClosedDate TIMESTAMP,
 
 
BorrowerAPR float,
 
 
BorrowerRate float,
 
 
LenderYield float,
 
 
EstimatedEffectiveYield float,
 
 
EstimatedLoss float,
 
 
EstimatedReturn float,
 
 
ProsperRating(numeric) int,
 
 
ProsperRating (Alpha) string,
 
 
ProsperScore int,
 
 
ListingCategory (numeric) int,
 
 
BorrowerState string,
 
 
Occupation string,
 
 
EmploymentStatus string,
 
 
EmploymentStatusDuration int,
 
 
IsBorrowerHomeowner string,
 
 
CurrentlyInGroup string,
 
 
GroupKey string,
 
 
DateCreditPulled TIMESTAMP,
 
 
CreditScoreRangeLower int,
 
 
CreditScoreRangeUpper int,
 
 
FirstRecordedCreditLine TIMESTAMP,
 
 
CurrentCreditLines int,
 
 
OpenCreditLines int,
 
 
TotalCreditLinespast7years int,
 
 
OpenRevolvingAccounts int,
 
 
OpenRevolvingMonthlyPayment int,
 
 
InquiriesLast6Months int,
 
 
TotalInquiries int,
 
 
CurrentDelinquencies int,
 
 
AmountDelinquent int,
 
 
DelinquenciesLast7Years int,
 
 
PublicRecordsLast10Years int,
 
 
PublicRecordsLast12Months int,
 
 
RevolvingCreditBalance int,
 
 
BankcardUtilization float,
 
 
AvailableBankcardCredit int,
 
 
TotalTrades int,
 
 
TradesNeverDelinquent (percentage) float,
 
 
TradesOpenedLast6Months int,
 
 
DebtToIncomeRatio float,
 
 
IncomeRange string,
 
 
IncomeVerifiable string,
 
 
StatedMonthlyIncome double,
 
 
LoanKey string,
 
 
TotalProsperLoans int,
 
 
TotalProsperPaymentsBilled int,
 
 
OnTimeProsperPayments int,
 
 
ProsperPaymentsLessThanOneMonthLate int,
 
 
ProsperPaymentsOneMonthPlusLate int,
 
 
ProsperPrincipalBorrowed int,
 
 
ProsperPrincipalOutstanding double,
 
 
ScorexChangeAtTimeOfListing int,
 
 
LoanCurrentDaysDelinquent int,
 
 
LoanFirstDefaultedCycleNumber int,
 
 
LoanMonthsSinceOrigination int,
 
 
LoanNumber int,
 
 
LoanOriginalAmount int,
 
 
LoanOriginationDate TIMESTAMP,
 
 
LoanOriginationQuarter string,
 
 
MemberKey string,
 
 
MonthlyLoanPayment float,
 
 
LP_CustomerPayments double,
 
 
LP_CustomerPrincipalPayments double,
 
 
LP_InterestandFees double,
 
LP_ServiceFees double,
 
 
LP_CollectionFees double,
 
 
LP_GrossPrincipalLoss double,
 
 
LP_NetPrincipalLoss double,
 
 
LP_NonPrincipalRecoverypayments double,
 
 
PercentFunded double,
 
 
Recommendations int,
 
 
InvestmentFromFriendsCount int,
 
 
InvestmentFromFriendsAmount double,
 
 
Investors int
 
 
)row format delimited
 
 
fields terminated by ',';
 
  
 
'''提取有用数据建表'''
 
'''提取有用数据建表'''
第222行: 第53行:
  
 
fields terminated by ',';
 
fields terminated by ',';
 +
 
==数据清洗==
 
==数据清洗==
 
导入另一个表
 
导入另一个表
第229行: 第61行:
 
导入后的数据示例
 
导入后的数据示例
  
Completed,0.158,,Other,Self-employed,TRUE,640,659,"$25,000-49,999"
+
<nowiki>Completed,0.158,,Other,Self-employed,TRUE,640,659,"$25,000-49,999"
  
 
Current,0.092,7,Professional,Employed,FALSE,680,699,"$50,000-74,999"
 
Current,0.092,7,Professional,Employed,FALSE,680,699,"$50,000-74,999"
第271行: 第103行:
 
Current,0.1314,7,Retail Management,Employed,TRUE,660,679,"$50,000-74,999"
 
Current,0.1314,7,Retail Management,Employed,TRUE,660,679,"$50,000-74,999"
  
Completed,0.2075,,Professional,Full-time,FALSE,620,639,"$25,000-49,999"
+
Completed,0.2075,,Professional,Full-time,FALSE,620,639,"$25,000-49,999"</nowiki>
  
  
第299行: 第131行:
 
group by Occupation order by a desc limit 3;
 
group by Occupation order by a desc limit 3;
  
'''求违约次数最高的职业的支持度,写入/root/college002/        支持度=违约,职业/总'''
+
'''求违约次数最高的职业的支持度,写入/root/college003/        支持度=违约,职业/总'''
 +
 
 +
[[文件:5454545454.png]]
  
 
select Occupation,count(*) as a from loan
 
select Occupation,count(*) as a from loan
第314行: 第148行:
  
 
select round(第一个sql结果/count(*),5) from loan ;
 
select round(第一个sql结果/count(*),5) from loan ;
 +
 +
== 数据分析 ==
 +
 +
 +
'''信用得分对借款的影响'''
 +
 +
select ProsperScore,avg(BorrowerRate) as t from loan group by ProsperScore;
 +
 +
#职业对借款难易的影响  (拿不准)
 +
 +
select Occupation,avg(BorrowerRate) from loan group by Occupation;
 +
 +
'''违约客户的就业信息分布情况'''
 +
 +
select Occupation,count(*) from loan
 +
 +
where LoanStatus='Defaulted' or LoanStatus='Chargedoff'
 +
 +
group by Occupation;
 +
 +
'''贷款人的收入范围统计'''
 +
 +
select IncomeRange,count(*) from loan
 +
 +
group by IncomeRange;
 +
 +
'''不同职业的信用得分分布'''
 +
 +
select Occupation,ProsperScore,count(ProsperScore) from loan
 +
 +
group by Occupation,ProsperScore;

2020年11月28日 (六) 14:31的最新版本

题干

Prosper是美国一家p2p在在线借贷平台,网站可撮合一些有闲钱的人和基于用钱的热,用户若有贷款需求,可在网站上列出期望数额和可承受最大利率,潜在贷方则为数额和利率展开竞价,以下是该网站的部分数据

LoanStatus,BorrowerRate,ProsperScore,Occupation,EmploymentStatus,IsBorrowerHomeowner,CreditScoreRangeLower,CreditScoreRangeUpper,IncomeRange

Completed,0.158,,Other,Self-employed,TRUE,640,659,"$25,000-49,999"

Current,0.092,7,Professional,Employed,FALSE,680,699,"$50,000-74,999"

Completed,0.275,,Other,Not available,FALSE,480,499,Not displayed

Current,0.0974,9,Skilled Labor,Employed,TRUE,800,819,"$25,000-49,999"

Current,0.2085,4,Executive,Employed,TRUE,680,699,"$100,000+"

Current,0.1314,10,Professional,Employed,TRUE,740,759,"$100,000+"

Current,0.2712,2,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999"

Current,0.2019,4,Laborer,Employed,FALSE,700,719,"$25,000-49,999"

12345646876545641.jpg

建表语句

建库

create database 库名;

提取有用数据建表

create table test(

LoanStatus string,

BorrowerRate float,

ProsperScore int,

Occupation string,

EmploymentStatus string,

IsBorrowerHomeowner string,

CreditScoreRangeLower int,

CreditScoreRangeUpper int,

IncomeRange string

)row format delimited

fields terminated by ',';

数据清洗

导入另一个表

insert into table test select LoanStatus,BorrowerRate,ProsperScore,Occupation,EmploymentStatus,IsBorrowerHomeowner,CreditScoreRangeLower,CreditScoreRangeUpper,IncomeRange form perop;

导入后的数据示例

Completed,0.158,,Other,Self-employed,TRUE,640,659,"$25,000-49,999"

Current,0.092,7,Professional,Employed,FALSE,680,699,"$50,000-74,999"

Completed,0.275,,Other,Not available,FALSE,480,499,Not displayed

Current,0.0974,9,Skilled Labor,Employed,TRUE,800,819,"$25,000-49,999"

Current,0.2085,4,Executive,Employed,TRUE,680,699,"$100,000+"

Current,0.1314,10,Professional,Employed,TRUE,740,759,"$100,000+"

Current,0.2712,2,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999"

Current,0.2019,4,Laborer,Employed,FALSE,700,719,"$25,000-49,999"

Current,0.0629,9,Food Service,Employed,TRUE,820,839,"$25,000-49,999"

Current,0.0629,11,Food Service,Employed,TRUE,820,839,"$25,000-49,999"

Current,0.2489,7,Fireman,Employed,FALSE,640,659,"$75,000-99,999"

Completed,0.1325,,Waiter/Waitress,Full-time,FALSE,640,659,"$1-24,999"

Past Due (1-15 days),0.1435,4,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999"

Current,0.1034,8,Construction,Employed,TRUE,740,759,"$50,000-74,999"

Current,0.0949,8,Computer Programmer,Employed,FALSE,740,759,"$100,000+"

Defaulted,0.3177,5,Other,Other,TRUE,700,719,"$50,000-74,999"

Current,0.2809,4,Professional,Full-time,TRUE,640,659,"$75,000-99,999"

Chargedoff,0.125,,Professional,Not available,TRUE,760,779,Not displayed

Current,0.0974,7,Sales - Commission,Employed,FALSE,740,759,"$75,000-99,999"

Current,0.2225,8,Laborer,Employed,FALSE,680,699,"$25,000-49,999"

Current,0.1314,7,Retail Management,Employed,TRUE,660,679,"$50,000-74,999"

Completed,0.2075,,Professional,Full-time,FALSE,620,639,"$25,000-49,999"


数据分析

跟据不同职业的信用分中位数作为该职业信用得分,分析某职业信用得分 由大到小top5,写入/root/college001/

INSERT OVERWRITE LOCAL DIRECTORY '/root/college001/'

ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'

select Occupation,percentile(ProsperScore,0.5) as a from loan

group by Occupation order by a desc limit 5;

违约职业前3名,写入/root/college002/

INSERT OVERWRITE LOCAL DIRECTORY '/root/college002/'

ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'

select Occupation,count(*) as a from loan

where LoanStatus='Defaulted' and Occupation<>'\N'

group by Occupation order by a desc limit 3;

求违约次数最高的职业的支持度,写入/root/college003/ 支持度=违约,职业/总

5454545454.png

select Occupation,count(*) as a from loan

where LoanStatus='Defaulted'

group by Occupation order by a desc limit 1;


INSERT OVERWRITE LOCAL DIRECTORY '/root/college006/'

ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'

select round(第一个sql结果/count(*),5) from loan ;

数据分析

信用得分对借款的影响

select ProsperScore,avg(BorrowerRate) as t from loan group by ProsperScore;

  1. 职业对借款难易的影响 (拿不准)

select Occupation,avg(BorrowerRate) from loan group by Occupation;

违约客户的就业信息分布情况

select Occupation,count(*) from loan

where LoanStatus='Defaulted' or LoanStatus='Chargedoff'

group by Occupation;

贷款人的收入范围统计

select IncomeRange,count(*) from loan

group by IncomeRange;

不同职业的信用得分分布

select Occupation,ProsperScore,count(ProsperScore) from loan

group by Occupation,ProsperScore;