|
|
(未显示1个用户的4个中间版本) |
第1行: |
第1行: |
| + | 1.爬取商城数据,并进行分析 |
| | | |
− | == 建表语句 ==
| + | '''具体步骤如下''' |
− | 建库
| |
| | | |
− | '''create database 库名;'''
| |
| | | |
− | '''建表,Prosper借贷平台 共81个字段'''
| |
| | | |
− | create table Prosper(
| |
| | | |
− | ListingKey string ,
| |
| | | |
− | ListingNumber int,
| |
| | | |
− | ListingCreationDate TIMESTAMP,
| + | '''上传数据''' |
| | | |
− | CreditGrade string,
| + | hadoop fs -mkdir -p /college |
| + | hadoop fs -put /root/college/loan.csv /college |
| | | |
− | Term int,
| |
| | | |
− | LoanStatus string,
| + | '''创建数据库''' |
| | | |
− | ClosedDate TIMESTAMP,
| |
− |
| |
− | BorrowerAPR float,
| |
− |
| |
− | BorrowerRate float,
| |
− |
| |
− | LenderYield float,
| |
− |
| |
− | EstimatedEffectiveYield float,
| |
− |
| |
− | EstimatedLoss float,
| |
− |
| |
− | EstimatedReturn float,
| |
− |
| |
− | ProsperRating(numeric) int,
| |
− |
| |
− | ProsperRating (Alpha) string,
| |
− |
| |
− | ProsperScore int,
| |
− |
| |
− | ListingCategory (numeric) int,
| |
− |
| |
− | BorrowerState string,
| |
− |
| |
− | Occupation string,
| |
− |
| |
− | EmploymentStatus string,
| |
− |
| |
− | EmploymentStatusDuration int,
| |
− |
| |
− | IsBorrowerHomeowner string,
| |
− |
| |
− | CurrentlyInGroup string,
| |
− |
| |
− | GroupKey string,
| |
− |
| |
− | DateCreditPulled TIMESTAMP,
| |
− |
| |
− | CreditScoreRangeLower int,
| |
− |
| |
− | CreditScoreRangeUpper int,
| |
− |
| |
− | FirstRecordedCreditLine TIMESTAMP,
| |
− |
| |
− | CurrentCreditLines int,
| |
− |
| |
− | OpenCreditLines int,
| |
− |
| |
− | TotalCreditLinespast7years int,
| |
− |
| |
− | OpenRevolvingAccounts int,
| |
− |
| |
− | OpenRevolvingMonthlyPayment int,
| |
− |
| |
− | InquiriesLast6Months int,
| |
− |
| |
− | TotalInquiries int,
| |
− |
| |
− | CurrentDelinquencies int,
| |
− |
| |
− | AmountDelinquent int,
| |
− |
| |
− | DelinquenciesLast7Years int,
| |
− |
| |
− | PublicRecordsLast10Years int,
| |
− |
| |
− | PublicRecordsLast12Months int,
| |
− |
| |
− | RevolvingCreditBalance int,
| |
− |
| |
− | BankcardUtilization float,
| |
− |
| |
− | AvailableBankcardCredit int,
| |
− |
| |
− | TotalTrades int,
| |
− |
| |
− | TradesNeverDelinquent (percentage) float,
| |
− |
| |
− | TradesOpenedLast6Months int,
| |
− |
| |
− | DebtToIncomeRatio float,
| |
− |
| |
− | IncomeRange string,
| |
− |
| |
− | IncomeVerifiable string,
| |
− |
| |
− | StatedMonthlyIncome double,
| |
− |
| |
− | LoanKey string,
| |
− |
| |
− | TotalProsperLoans int,
| |
− |
| |
− | TotalProsperPaymentsBilled int,
| |
− |
| |
− | OnTimeProsperPayments int,
| |
− |
| |
− | ProsperPaymentsLessThanOneMonthLate int,
| |
− |
| |
− | ProsperPaymentsOneMonthPlusLate int,
| |
− |
| |
− | ProsperPrincipalBorrowed int,
| |
− |
| |
− | ProsperPrincipalOutstanding double,
| |
− |
| |
− | ScorexChangeAtTimeOfListing int,
| |
− |
| |
− | LoanCurrentDaysDelinquent int,
| |
− |
| |
− | LoanFirstDefaultedCycleNumber int,
| |
− |
| |
− | LoanMonthsSinceOrigination int,
| |
− |
| |
− | LoanNumber int,
| |
− |
| |
− | LoanOriginalAmount int,
| |
− |
| |
− | LoanOriginationDate TIMESTAMP,
| |
− |
| |
− | LoanOriginationQuarter string,
| |
− |
| |
− | MemberKey string,
| |
− |
| |
− | MonthlyLoanPayment float,
| |
− |
| |
− | LP_CustomerPayments double,
| |
− |
| |
− | LP_CustomerPrincipalPayments double,
| |
− |
| |
− | LP_InterestandFees double,
| |
− | LP_ServiceFees double,
| |
− |
| |
− | LP_CollectionFees double,
| |
− |
| |
− | LP_GrossPrincipalLoss double,
| |
− |
| |
− | LP_NetPrincipalLoss double,
| |
− |
| |
− | LP_NonPrincipalRecoverypayments double,
| |
− |
| |
− | PercentFunded double,
| |
− |
| |
− | Recommendations int,
| |
− |
| |
− | InvestmentFromFriendsCount int,
| |
− |
| |
− | InvestmentFromFriendsAmount double,
| |
− |
| |
− | Investors int
| |
− |
| |
− | )row format delimited
| |
− |
| |
− | fields terminated by ',';
| |
− |
| |
− | '''提取有用数据建表'''
| |
− |
| |
− | create table test(
| |
− |
| |
− | LoanStatus string,
| |
− |
| |
− | BorrowerRate float,
| |
− |
| |
− | ProsperScore int,
| |
− |
| |
− | Occupation string,
| |
− |
| |
− | EmploymentStatus string,
| |
− |
| |
− | IsBorrowerHomeowner string,
| |
− |
| |
− | CreditScoreRangeLower int,
| |
− |
| |
− | CreditScoreRangeUpper int,
| |
− |
| |
− | IncomeRange string
| |
| | | |
| + | create database 库名; |
| + | create table 表名1( |
| + | id int, |
| + | name string, |
| + | price int, |
| + | views int, |
| + | sales int, |
| + | stock int |
| )row format delimited | | )row format delimited |
− |
| |
| fields terminated by ','; | | fields terminated by ','; |
| | | |
− | ==数据清洗==
| + | '''导入数据-》hive''' |
− | 导入另一个表
| |
− | | |
− | insert into table test select LoanStatus,BorrowerRate,ProsperScore,Occupation,EmploymentStatus,IsBorrowerHomeowner,CreditScoreRangeLower,CreditScoreRangeUpper,IncomeRange form perop;
| |
− | | |
− | 导入后的数据示例
| |
− | | |
− | Completed,0.158,,Other,Self-employed,TRUE,640,659,"$25,000-49,999"
| |
− | | |
− | Current,0.092,7,Professional,Employed,FALSE,680,699,"$50,000-74,999"
| |
− | | |
− | Completed,0.275,,Other,Not available,FALSE,480,499,Not displayed
| |
− | | |
− | Current,0.0974,9,Skilled Labor,Employed,TRUE,800,819,"$25,000-49,999"
| |
− | | |
− | Current,0.2085,4,Executive,Employed,TRUE,680,699,"$100,000+"
| |
− | | |
− | Current,0.1314,10,Professional,Employed,TRUE,740,759,"$100,000+"
| |
| | | |
− | Current,0.2712,2,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999"
| |
| | | |
− | Current,0.2019,4,Laborer,Employed,FALSE,700,719,"$25,000-49,999"
| + | load data inpath '/college/loan.csv' into table 表名; |
| | | |
− | Current,0.0629,9,Food Service,Employed,TRUE,820,839,"$25,000-49,999"
| |
| | | |
− | Current,0.0629,11,Food Service,Employed,TRUE,820,839,"$25,000-49,999"
| + | load data [local] inpath ‘/root/data’ into table psn; 表示本地而不是Hadoop |
| | | |
− | Current,0.2489,7,Fireman,Employed,FALSE,640,659,"$75,000-99,999"
| + | '''创建新表存储''' |
| | | |
− | Completed,0.1325,,Waiter/Waitress,Full-time,FALSE,640,659,"$1-24,999"
| + | create table 表名2 like 表名2; |
| | | |
− | Past Due (1-15 days),0.1435,4,Sales - Retail,Employed,FALSE,680,699,"$25,000-49,999"
| + | '''数据过滤''' |
| | | |
− | Current,0.1034,8,Construction,Employed,TRUE,740,759,"$50,000-74,999"
| |
| | | |
− | Current,0.0949,8,Computer Programmer,Employed,FALSE,740,759,"$100,000+"
| + | insert into 表名2 |
| + | select * from 表名1 |
| + | where not id is null and name is null and price is null and views is null and sales is null and stock is null |
| + | and name not like '%包包%' and name not like'%衣%' and name not like'%女士%' |
| + | group by id,name,price,views,sales,stock; |
| | | |
− | Defaulted,0.3177,5,Other,Other,TRUE,700,719,"$50,000-74,999"
| |
| | | |
− | Current,0.2809,4,Professional,Full-time,TRUE,640,659,"$75,000-99,999"
| + | '''数据分析并下载''' |
| | | |
− | Chargedoff,0.125,,Professional,Not available,TRUE,760,779,Not displayed
| |
| | | |
− | Current,0.0974,7,Sales - Commission,Employed,FALSE,740,759,"$75,000-99,999"
| + | INSERT OVERWRITE LOCAL DIRECTORY '/root/college022/’ |
| + | ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t’ |
| + | SELECT count(distinct author) FROM data; |
| | | |
− | Current,0.2225,8,Laborer,Employed,FALSE,680,699,"$25,000-49,999"
| + | insert into table test partition(age='xx') select id,name,tel form perop; |
| | | |
− | Current,0.1314,7,Retail Management,Employed,TRUE,660,679,"$50,000-74,999"
| + | '''数据切分''' |
| | | |
− | Completed,0.2075,,Professional,Full-time,FALSE,620,639,"$25,000-49,999"
| + | select split(name, '\ ') from data |
| + | where split(name, '\ ')[0]='Huawei/华为'; |