“数据清洗,抽取”的版本间的差异

来自CloudWiki
跳转至: 导航搜索
建表语句
 
(未显示2个用户的5个中间版本)
第1行: 第1行:
 +
1.爬取商城数据,并进行分析
  
== 建表语句 ==
+
'''具体步骤如下'''
建库
 
  
'''create database 库名;'''
 
  
'''建表,Prosper借贷平台 共81个字段'''
 
  
create table Prosper(
 
  
ListingKey string ,
 
  
ListingNumber int,
 
  
ListingCreationDate TIMESTAMP,
+
'''上传数据'''
  
CreditGrade string,
+
hadoop fs -mkdir -p /college
 +
hadoop fs -put /root/college/loan.csv /college
  
Term int,
 
  
LoanStatus string,
+
'''创建数据库'''
  
ClosedDate TIMESTAMP,
 
  
BorrowerAPR float,
+
create database 库名;
 
+
create table 表名1(
BorrowerRate float,
+
id int,
 
+
name string,
LenderYield float,
+
price int,
 
+
views int,
EstimatedEffectiveYield float,
+
sales int,
 
+
stock int
EstimatedLoss float,
+
)row format delimited
 
+
fields terminated by ',';
EstimatedReturn float,
 
 
 
ProsperRating(numeric) int,
 
 
 
ProsperRating (Alpha) string,
 
 
 
ProsperScore int,
 
 
 
ListingCategory (numeric) int,
 
 
 
BorrowerState string,
 
 
 
Occupation string,
 
 
 
EmploymentStatus string,
 
 
 
EmploymentStatusDuration int,
 
 
 
IsBorrowerHomeowner string,
 
 
 
CurrentlyInGroup string,
 
 
 
GroupKey string,
 
 
 
DateCreditPulled TIMESTAMP,
 
 
 
CreditScoreRangeLower int,
 
 
 
CreditScoreRangeUpper int,
 
 
 
FirstRecordedCreditLine TIMESTAMP,
 
 
 
CurrentCreditLines int,
 
 
 
OpenCreditLines int,
 
 
 
TotalCreditLinespast7years int,
 
 
 
OpenRevolvingAccounts int,
 
 
 
OpenRevolvingMonthlyPayment int,
 
 
 
InquiriesLast6Months int,
 
 
 
TotalInquiries int,
 
 
 
CurrentDelinquencies int,
 
 
 
AmountDelinquent int,
 
 
 
DelinquenciesLast7Years int,
 
 
 
PublicRecordsLast10Years int,
 
 
 
PublicRecordsLast12Months int,
 
 
 
RevolvingCreditBalance int,
 
 
 
BankcardUtilization float,
 
 
 
AvailableBankcardCredit int,
 
 
 
TotalTrades int,
 
 
 
TradesNeverDelinquent (percentage) float,
 
 
 
TradesOpenedLast6Months int,
 
 
 
DebtToIncomeRatio float,
 
 
 
IncomeRange string,
 
 
 
IncomeVerifiable string,
 
 
 
StatedMonthlyIncome double,
 
 
 
LoanKey string,
 
 
 
TotalProsperLoans int,
 
 
 
TotalProsperPaymentsBilled int,
 
 
 
OnTimeProsperPayments int,
 
 
 
ProsperPaymentsLessThanOneMonthLate int,
 
 
 
ProsperPaymentsOneMonthPlusLate int,
 
 
 
ProsperPrincipalBorrowed int,
 
 
 
ProsperPrincipalOutstanding double,
 
 
 
ScorexChangeAtTimeOfListing int,
 
 
 
LoanCurrentDaysDelinquent int,
 
 
 
LoanFirstDefaultedCycleNumber int,
 
  
LoanMonthsSinceOrigination int,
+
'''导入数据-》hive'''
  
LoanNumber int,
 
  
LoanOriginalAmount int,
+
load data inpath '/college/loan.csv' into table 表名;
  
LoanOriginationDate TIMESTAMP,
 
  
LoanOriginationQuarter string,
+
load data [local] inpath ‘/root/data’ into table psn; 表示本地而不是Hadoop
  
MemberKey string,
+
'''创建新表存储'''
  
MonthlyLoanPayment float,
+
create table 表名2 like 表名2;
  
LP_CustomerPayments double,
+
'''数据过滤'''
  
LP_CustomerPrincipalPayments double,
 
  
LP_InterestandFees double,
+
insert into 表名2
LP_ServiceFees double,
+
select * from 表名1
 +
where not id is null and name is null and price is null and views is null and sales is null and stock is null
 +
and name not like '%包包%' and name not like'%衣%' and name not like'%女士%'
 +
group by id,name,price,views,sales,stock;
  
LP_CollectionFees double,
 
  
LP_GrossPrincipalLoss double,
+
'''数据分析并下载'''
  
LP_NetPrincipalLoss double,
 
  
LP_NonPrincipalRecoverypayments double,
+
INSERT OVERWRITE LOCAL DIRECTORY '/root/college022/’
 +
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t’
 +
SELECT count(distinct author) FROM data;
  
PercentFunded double,
+
insert into table test partition(age='xx') select id,name,tel form perop;
 
 
Recommendations int,
 
 
 
InvestmentFromFriendsCount int,
 
 
 
InvestmentFromFriendsAmount double,
 
 
 
Investors int
 
 
 
)row format delimited
 
 
 
fields terminated by ',';
 
 
 
'''提取有用数据建表'''
 
 
 
create table test(
 
 
 
LoanStatus string,
 
 
 
BorrowerRate float,
 
 
 
ProsperScore int,
 
 
 
Occupation string,
 
 
 
EmploymentStatus string,
 
 
 
IsBorrowerHomeowner string,
 
 
 
CreditScoreRangeLower int,
 
 
 
CreditScoreRangeUpper int,
 
 
 
IncomeRange string
 
 
 
)row format delimited
 
 
 
fields terminated by ',';
 
  
==数据清洗==
+
'''数据切分'''
导入另一个表
 
  
insert into table test select LoanStatus,BorrowerRate,ProsperScore,Occupation,EmploymentStatus,IsBorrowerHomeowner,CreditScoreRangeLower,CreditScoreRangeUpper,IncomeRange form perop;
+
select split(name, '\ ') from data
 +
where split(name, '\ ')[0]='Huawei/华为';

2020年11月18日 (三) 14:16的最新版本

1.爬取商城数据,并进行分析

具体步骤如下




上传数据

hadoop fs -mkdir -p /college hadoop fs -put /root/college/loan.csv /college


创建数据库


create database 库名; create table 表名1( id int, name string, price int, views int, sales int, stock int )row format delimited fields terminated by ',';

导入数据-》hive


load data inpath '/college/loan.csv' into table 表名;


load data [local] inpath ‘/root/data’ into table psn; 表示本地而不是Hadoop

创建新表存储

create table 表名2 like 表名2;

数据过滤


insert into 表名2 select * from 表名1 where not id is null and name is null and price is null and views is null and sales is null and stock is null and name not like '%包包%' and name not like'%衣%' and name not like'%女士%' group by id,name,price,views,sales,stock;


数据分析并下载


INSERT OVERWRITE LOCAL DIRECTORY '/root/college022/’ ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t’ SELECT count(distinct author) FROM data;

insert into table test partition(age='xx') select id,name,tel form perop;

数据切分

select split(name, '\ ') from data where split(name, '\ ')[0]='Huawei/华为';