“数据清洗,抽取”的版本间的差异

来自CloudWiki
跳转至: 导航搜索
数据清洗
建表语句
第1行: 第1行:
 
== 建表语句 ==
 
建库
 
 
'''create database 库名;'''
 
 
'''建表,Prosper借贷平台 共81个字段'''
 
 
create table Prosper(
 
 
ListingKey string ,
 
 
ListingNumber int,
 
 
ListingCreationDate TIMESTAMP,
 
 
CreditGrade string,
 
 
Term int,
 
 
LoanStatus string,
 
 
ClosedDate TIMESTAMP,
 
 
BorrowerAPR float,
 
 
BorrowerRate float,
 
 
LenderYield float,
 
 
EstimatedEffectiveYield float,
 
 
EstimatedLoss float,
 
 
EstimatedReturn float,
 
 
ProsperRating(numeric) int,
 
 
ProsperRating (Alpha) string,
 
 
ProsperScore int,
 
 
ListingCategory (numeric) int,
 
 
BorrowerState string,
 
 
Occupation string,
 
 
EmploymentStatus string,
 
 
EmploymentStatusDuration int,
 
 
IsBorrowerHomeowner string,
 
 
CurrentlyInGroup string,
 
 
GroupKey string,
 
 
DateCreditPulled TIMESTAMP,
 
 
CreditScoreRangeLower int,
 
 
CreditScoreRangeUpper int,
 
 
FirstRecordedCreditLine TIMESTAMP,
 
 
CurrentCreditLines int,
 
 
OpenCreditLines int,
 
 
TotalCreditLinespast7years int,
 
 
OpenRevolvingAccounts int,
 
 
OpenRevolvingMonthlyPayment int,
 
 
InquiriesLast6Months int,
 
 
TotalInquiries int,
 
 
CurrentDelinquencies int,
 
 
AmountDelinquent int,
 
 
DelinquenciesLast7Years int,
 
 
PublicRecordsLast10Years int,
 
 
PublicRecordsLast12Months int,
 
 
RevolvingCreditBalance int,
 
 
BankcardUtilization float,
 
 
AvailableBankcardCredit int,
 
 
TotalTrades int,
 
 
TradesNeverDelinquent (percentage) float,
 
 
TradesOpenedLast6Months int,
 
 
DebtToIncomeRatio float,
 
 
IncomeRange string,
 
 
IncomeVerifiable string,
 
 
StatedMonthlyIncome double,
 
 
LoanKey string,
 
 
TotalProsperLoans int,
 
 
TotalProsperPaymentsBilled int,
 
 
OnTimeProsperPayments int,
 
 
ProsperPaymentsLessThanOneMonthLate int,
 
 
ProsperPaymentsOneMonthPlusLate int,
 
 
ProsperPrincipalBorrowed int,
 
 
ProsperPrincipalOutstanding double,
 
 
ScorexChangeAtTimeOfListing int,
 
 
LoanCurrentDaysDelinquent int,
 
 
LoanFirstDefaultedCycleNumber int,
 
 
LoanMonthsSinceOrigination int,
 
 
LoanNumber int,
 
 
LoanOriginalAmount int,
 
 
LoanOriginationDate TIMESTAMP,
 
 
LoanOriginationQuarter string,
 
 
MemberKey string,
 
 
MonthlyLoanPayment float,
 
 
LP_CustomerPayments double,
 
 
LP_CustomerPrincipalPayments double,
 
 
LP_InterestandFees double,
 
LP_ServiceFees double,
 
 
LP_CollectionFees double,
 
 
LP_GrossPrincipalLoss double,
 
 
LP_NetPrincipalLoss double,
 
 
LP_NonPrincipalRecoverypayments double,
 
 
PercentFunded double,
 
 
Recommendations int,
 
 
InvestmentFromFriendsCount int,
 
 
InvestmentFromFriendsAmount double,
 
 
Investors int
 
 
)row format delimited
 
 
fields terminated by ',';
 
 
'''提取有用数据建表'''
 
 
create table test(
 
 
LoanStatus string,
 
 
BorrowerRate float,
 
 
ProsperScore int,
 
 
Occupation string,
 
 
EmploymentStatus string,
 
 
IsBorrowerHomeowner string,
 
 
CreditScoreRangeLower int,
 
 
CreditScoreRangeUpper int,
 
 
IncomeRange string
 
 
)row format delimited
 
 
fields terminated by ',';
 
  
 
==数据清洗==
 
==数据清洗==

2020年11月17日 (二) 01:10的版本

数据清洗

导入另一个表

insert into table test select LoanStatus,BorrowerRate,ProsperScore,Occupation,EmploymentStatus,IsBorrowerHomeowner,CreditScoreRangeLower,CreditScoreRangeUpper,IncomeRange form perop;

导入后的数据示例

Completed,0.158,,Other,Self-employed,TRUE,640,659,"$25,000-49,999"

Current,0.092,7,Professional,Employed,FALSE,680,699,"$50,000-74,999"

Completed,0.275,,Other,Not available,FALSE,480,499,Not displayed

Current,0.0974,9,Skilled Labor,Employed,TRUE,800,819,"$25,000-49,999"

Current,0.2085,4,Executive,Employed,TRUE,680,699,"$100,000+"

Current,0.1314,10,Professional,Employed,TRUE,740,759,"$100,000+"

Current,0.2712,2,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999"

Current,0.2019,4,Laborer,Employed,FALSE,700,719,"$25,000-49,999"

Current,0.0629,9,Food Service,Employed,TRUE,820,839,"$25,000-49,999"

Current,0.0629,11,Food Service,Employed,TRUE,820,839,"$25,000-49,999"

Current,0.2489,7,Fireman,Employed,FALSE,640,659,"$75,000-99,999"

Completed,0.1325,,Waiter/Waitress,Full-time,FALSE,640,659,"$1-24,999"

Past Due (1-15 days),0.1435,4,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999"

Current,0.1034,8,Construction,Employed,TRUE,740,759,"$50,000-74,999"

Current,0.0949,8,Computer Programmer,Employed,FALSE,740,759,"$100,000+"

Defaulted,0.3177,5,Other,Other,TRUE,700,719,"$50,000-74,999"

Current,0.2809,4,Professional,Full-time,TRUE,640,659,"$75,000-99,999"

Chargedoff,0.125,,Professional,Not available,TRUE,760,779,Not displayed

Current,0.0974,7,Sales - Commission,Employed,FALSE,740,759,"$75,000-99,999"

Current,0.2225,8,Laborer,Employed,FALSE,680,699,"$25,000-49,999"

Current,0.1314,7,Retail Management,Employed,TRUE,660,679,"$50,000-74,999"

Completed,0.2075,,Professional,Full-time,FALSE,620,639,"$25,000-49,999"