“2020红亚杯:借贷数据分析”的版本间的差异
(→数据分析) |
|||
(未显示2个用户的5个中间版本) | |||
第3行: | 第3行: | ||
Prosper是美国一家p2p在在线借贷平台,网站可撮合一些有闲钱的人和基于用钱的热,用户若有贷款需求,可在网站上列出期望数额和可承受最大利率,潜在贷方则为数额和利率展开竞价,以下是该网站的部分数据 | Prosper是美国一家p2p在在线借贷平台,网站可撮合一些有闲钱的人和基于用钱的热,用户若有贷款需求,可在网站上列出期望数额和可承受最大利率,潜在贷方则为数额和利率展开竞价,以下是该网站的部分数据 | ||
− | LoanStatus,BorrowerRate,ProsperScore,Occupation,EmploymentStatus,IsBorrowerHomeowner,CreditScoreRangeLower,CreditScoreRangeUpper,IncomeRange | + | <nowiki>LoanStatus,BorrowerRate,ProsperScore,Occupation,EmploymentStatus,IsBorrowerHomeowner,CreditScoreRangeLower,CreditScoreRangeUpper,IncomeRange |
Completed,0.158,,Other,Self-employed,TRUE,640,659,"$25,000-49,999" | Completed,0.158,,Other,Self-employed,TRUE,640,659,"$25,000-49,999" | ||
第19行: | 第19行: | ||
Current,0.2712,2,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999" | Current,0.2712,2,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999" | ||
− | Current,0.2019,4,Laborer,Employed,FALSE,700,719,"$25,000-49,999" | + | Current,0.2019,4,Laborer,Employed,FALSE,700,719,"$25,000-49,999"</nowiki> |
[[文件:12345646876545641.jpg|750px]] | [[文件:12345646876545641.jpg|750px]] | ||
第27行: | 第27行: | ||
'''create database 库名;''' | '''create database 库名;''' | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
'''提取有用数据建表''' | '''提取有用数据建表''' | ||
第222行: | 第53行: | ||
fields terminated by ','; | fields terminated by ','; | ||
+ | |||
==数据清洗== | ==数据清洗== | ||
导入另一个表 | 导入另一个表 | ||
第229行: | 第61行: | ||
导入后的数据示例 | 导入后的数据示例 | ||
− | Completed,0.158,,Other,Self-employed,TRUE,640,659,"$25,000-49,999" | + | <nowiki>Completed,0.158,,Other,Self-employed,TRUE,640,659,"$25,000-49,999" |
Current,0.092,7,Professional,Employed,FALSE,680,699,"$50,000-74,999" | Current,0.092,7,Professional,Employed,FALSE,680,699,"$50,000-74,999" | ||
第271行: | 第103行: | ||
Current,0.1314,7,Retail Management,Employed,TRUE,660,679,"$50,000-74,999" | Current,0.1314,7,Retail Management,Employed,TRUE,660,679,"$50,000-74,999" | ||
− | Completed,0.2075,,Professional,Full-time,FALSE,620,639,"$25,000-49,999" | + | Completed,0.2075,,Professional,Full-time,FALSE,620,639,"$25,000-49,999"</nowiki> |
第287行: | 第119行: | ||
group by Occupation order by a desc limit 5; | group by Occupation order by a desc limit 5; | ||
− | ''' | + | '''违约职业前3名,写入/root/college002/''' |
INSERT OVERWRITE LOCAL DIRECTORY '/root/college002/' | INSERT OVERWRITE LOCAL DIRECTORY '/root/college002/' | ||
第299行: | 第131行: | ||
group by Occupation order by a desc limit 3; | group by Occupation order by a desc limit 3; | ||
− | '''求违约次数最高的职业的支持度,写入/root/ | + | '''求违约次数最高的职业的支持度,写入/root/college003/ 支持度=违约,职业/总''' |
+ | |||
+ | [[文件:5454545454.png]] | ||
select Occupation,count(*) as a from loan | select Occupation,count(*) as a from loan | ||
第314行: | 第148行: | ||
select round(第一个sql结果/count(*),5) from loan ; | select round(第一个sql结果/count(*),5) from loan ; | ||
+ | |||
+ | == 数据分析 == | ||
+ | |||
+ | |||
+ | '''信用得分对借款的影响''' | ||
+ | |||
+ | select ProsperScore,avg(BorrowerRate) as t from loan group by ProsperScore; | ||
+ | |||
+ | #职业对借款难易的影响 (拿不准) | ||
+ | |||
+ | select Occupation,avg(BorrowerRate) from loan group by Occupation; | ||
+ | |||
+ | '''违约客户的就业信息分布情况''' | ||
+ | |||
+ | select Occupation,count(*) from loan | ||
+ | |||
+ | where LoanStatus='Defaulted' or LoanStatus='Chargedoff' | ||
+ | |||
+ | group by Occupation; | ||
+ | |||
+ | '''贷款人的收入范围统计''' | ||
+ | |||
+ | select IncomeRange,count(*) from loan | ||
+ | |||
+ | group by IncomeRange; | ||
+ | |||
+ | '''不同职业的信用得分分布''' | ||
+ | |||
+ | select Occupation,ProsperScore,count(ProsperScore) from loan | ||
+ | |||
+ | group by Occupation,ProsperScore; |
2020年11月28日 (六) 14:31的最新版本
题干
Prosper是美国一家p2p在在线借贷平台,网站可撮合一些有闲钱的人和基于用钱的热,用户若有贷款需求,可在网站上列出期望数额和可承受最大利率,潜在贷方则为数额和利率展开竞价,以下是该网站的部分数据
LoanStatus,BorrowerRate,ProsperScore,Occupation,EmploymentStatus,IsBorrowerHomeowner,CreditScoreRangeLower,CreditScoreRangeUpper,IncomeRange Completed,0.158,,Other,Self-employed,TRUE,640,659,"$25,000-49,999" Current,0.092,7,Professional,Employed,FALSE,680,699,"$50,000-74,999" Completed,0.275,,Other,Not available,FALSE,480,499,Not displayed Current,0.0974,9,Skilled Labor,Employed,TRUE,800,819,"$25,000-49,999" Current,0.2085,4,Executive,Employed,TRUE,680,699,"$100,000+" Current,0.1314,10,Professional,Employed,TRUE,740,759,"$100,000+" Current,0.2712,2,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999" Current,0.2019,4,Laborer,Employed,FALSE,700,719,"$25,000-49,999"
建表语句
建库
create database 库名;
提取有用数据建表
create table test(
LoanStatus string,
BorrowerRate float,
ProsperScore int,
Occupation string,
EmploymentStatus string,
IsBorrowerHomeowner string,
CreditScoreRangeLower int,
CreditScoreRangeUpper int,
IncomeRange string
)row format delimited
fields terminated by ',';
数据清洗
导入另一个表
insert into table test select LoanStatus,BorrowerRate,ProsperScore,Occupation,EmploymentStatus,IsBorrowerHomeowner,CreditScoreRangeLower,CreditScoreRangeUpper,IncomeRange form perop;
导入后的数据示例
Completed,0.158,,Other,Self-employed,TRUE,640,659,"$25,000-49,999" Current,0.092,7,Professional,Employed,FALSE,680,699,"$50,000-74,999" Completed,0.275,,Other,Not available,FALSE,480,499,Not displayed Current,0.0974,9,Skilled Labor,Employed,TRUE,800,819,"$25,000-49,999" Current,0.2085,4,Executive,Employed,TRUE,680,699,"$100,000+" Current,0.1314,10,Professional,Employed,TRUE,740,759,"$100,000+" Current,0.2712,2,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999" Current,0.2019,4,Laborer,Employed,FALSE,700,719,"$25,000-49,999" Current,0.0629,9,Food Service,Employed,TRUE,820,839,"$25,000-49,999" Current,0.0629,11,Food Service,Employed,TRUE,820,839,"$25,000-49,999" Current,0.2489,7,Fireman,Employed,FALSE,640,659,"$75,000-99,999" Completed,0.1325,,Waiter/Waitress,Full-time,FALSE,640,659,"$1-24,999" Past Due (1-15 days),0.1435,4,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999" Current,0.1034,8,Construction,Employed,TRUE,740,759,"$50,000-74,999" Current,0.0949,8,Computer Programmer,Employed,FALSE,740,759,"$100,000+" Defaulted,0.3177,5,Other,Other,TRUE,700,719,"$50,000-74,999" Current,0.2809,4,Professional,Full-time,TRUE,640,659,"$75,000-99,999" Chargedoff,0.125,,Professional,Not available,TRUE,760,779,Not displayed Current,0.0974,7,Sales - Commission,Employed,FALSE,740,759,"$75,000-99,999" Current,0.2225,8,Laborer,Employed,FALSE,680,699,"$25,000-49,999" Current,0.1314,7,Retail Management,Employed,TRUE,660,679,"$50,000-74,999" Completed,0.2075,,Professional,Full-time,FALSE,620,639,"$25,000-49,999"
数据分析
跟据不同职业的信用分中位数作为该职业信用得分,分析某职业信用得分 由大到小top5,写入/root/college001/
INSERT OVERWRITE LOCAL DIRECTORY '/root/college001/'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select Occupation,percentile(ProsperScore,0.5) as a from loan
group by Occupation order by a desc limit 5;
违约职业前3名,写入/root/college002/
INSERT OVERWRITE LOCAL DIRECTORY '/root/college002/'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select Occupation,count(*) as a from loan
where LoanStatus='Defaulted' and Occupation<>'\N'
group by Occupation order by a desc limit 3;
求违约次数最高的职业的支持度,写入/root/college003/ 支持度=违约,职业/总
select Occupation,count(*) as a from loan
where LoanStatus='Defaulted'
group by Occupation order by a desc limit 1;
INSERT OVERWRITE LOCAL DIRECTORY '/root/college006/'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select round(第一个sql结果/count(*),5) from loan ;
数据分析
信用得分对借款的影响
select ProsperScore,avg(BorrowerRate) as t from loan group by ProsperScore;
- 职业对借款难易的影响 (拿不准)
select Occupation,avg(BorrowerRate) from loan group by Occupation;
违约客户的就业信息分布情况
select Occupation,count(*) from loan
where LoanStatus='Defaulted' or LoanStatus='Chargedoff'
group by Occupation;
贷款人的收入范围统计
select IncomeRange,count(*) from loan
group by IncomeRange;
不同职业的信用得分分布
select Occupation,ProsperScore,count(ProsperScore) from loan
group by Occupation,ProsperScore;