利用Pandas清洗csv数据

来自CloudWiki
跳转至: 导航搜索

读取文件

import pandas as pd
import numpy as np
import csv
df = pd.read_csv(r'feizhu2.csv',low_memory=False)

获取列索引值

#获取列索引值
print(df.columns)#获取列索引值

插入新列

#插入新列
data1 = df2['国籍']#获取列名为flow的数据作为新列的数据
df2['所在区域'] = data1

df2.to_csv(r"test2.csv",mode = 'a',index =False)

数据筛选

筛选特定网址的数据行,

筛选特定价格的数据行:

#筛选
#print(df.loc[1:4,['国家','名称']])
df1 = df[(df['商品链接'].str.contains("https://traveldetail.fliggy.com",case=False, na=False))]
df2 = df1[((df1['价格'].astype('int'))>=50)]
#https://www.cnblogs.com/xiaobingqianrui/p/9996177.html

数据遍历和修改数据

asia = ("中国","朝鲜","韩国","蒙古","日本","越南","老挝","柬埔寨",
        "缅甸","泰国","马来西亚","新加坡","印度尼西亚","菲律宾",
        "文莱","东帝汶","斯里兰卡","马尔代夫","巴基斯坦","印度",
        "孟加拉国","尼泊尔","不丹","伊朗","土耳其","塞浦路斯",
        "叙利亚","黎巴嫩","巴勒斯坦","以色列","约旦","伊拉克",
        "科威特","沙特阿拉伯","也门","阿曼","阿拉伯联合酋长国",
        "卡塔尔","巴林","格鲁吉亚","亚美尼亚","阿塞拜疆","阿拉伯","长滩岛")
for index,row in df2.iterrows():
    
    print(row['国籍'],row['名称'],type(row['国籍']),type(row['名称']))
    
    if row['国籍'] in asia:
        df2['所在区域'][index] = '亚洲'

完整代码:

import pandas as pd
import numpy as np
import csv
df = pd.read_csv(r'feizhu2.csv',low_memory=False)
#获取列索引值
print(df.columns)#获取列索引值

#筛选
#print(df.loc[1:4,['国家','名称']])
df1 = df[(df['商品链接'].str.contains("https://traveldetail.fliggy.com",case=False, na=False))]
df2 = df1[((df1['价格'].astype('int'))>=50)]
#https://www.cnblogs.com/xiaobingqianrui/p/9996177.html

#
#插入新列
data1 = df2['国籍']#获取列名为flow的数据作为新列的数据
df2['所在区域'] = data1

df2.to_csv(r"test2.csv",mode = 'a',index =False)
#'''
asia = ("中国","朝鲜","韩国","蒙古","日本","越南","老挝","柬埔寨",
        "缅甸","泰国","马来西亚","新加坡","印度尼西亚","菲律宾",
        "文莱","东帝汶","斯里兰卡","马尔代夫","巴基斯坦","印度",
        "孟加拉国","尼泊尔","不丹","伊朗","土耳其","塞浦路斯",
        "叙利亚","黎巴嫩","巴勒斯坦","以色列","约旦","伊拉克",
        "科威特","沙特阿拉伯","也门","阿曼","阿拉伯联合酋长国",
        "卡塔尔","巴林","格鲁吉亚","亚美尼亚","阿塞拜疆","阿拉伯","长滩岛")

europe = ("奥地利","比利时","保加利亚","塞浦路斯","克罗地亚",
           "捷克共和国","丹麦","爱沙尼亚","芬兰","法国","德国",
           "希腊","匈牙利","爱尔兰","意大利","拉脱维亚","立陶宛",
           "卢森堡","马耳他","荷兰","波兰","葡萄牙","罗马尼亚",
           "斯洛伐克","斯洛文尼亚","西班牙","瑞典","英国")

america = ("阿根廷","巴哈马","伯利兹","美国","玻利维亚","巴西",
           "巴巴多斯","加拿大","哥伦比亚","智利","哥斯达黎加",
           "古巴","委内瑞拉","萨尔瓦多","厄瓜多尔","格林纳达",
           "危地马拉","圭亚那","洪都拉斯","海地","牙买加",
           "圣卢西亚","墨西哥","尼加拉瓜","巴拿马","秘鲁",
           "乌拉圭","巴拉圭","苏里南","多米尼加","多米尼克",
           "圣文森特和格林纳丁斯","特立尼达和多巴哥",
           "安提瓜和巴布","达圣基茨和尼维斯")

us_city = ('塞班','纽约','洛杉矶','华盛顿','旧金山','夏威夷','芝加哥','波士顿',
           '西雅图','迈阿密','休斯顿','拉斯维加斯','阿拉斯加','波特兰')

ocean =("澳大利亚","新西兰","巴布亚新几内亚","所罗门",
        "瓦努阿图","帕劳","瑙鲁","图瓦卢","基里巴斯","萨摩亚",
        "汤加","密克罗尼西亚","斐济","密克罗尼亚 ","马绍尔")

africa= ("阿尔及利亚","埃及","埃塞俄比亚","安哥拉","贝宁",
         "博茨瓦纳","布基纳法索","布隆迪","赤道几内亚",
         "多哥","厄立特里亚","佛得角","冈比亚","刚果",
         "吉布提","几内亚","几内亚比绍","加纳","加蓬",
         "津巴布韦","喀麦隆","科摩罗","科特迪瓦","肯尼亚",
         "莱索托","利比里亚","利比亚","卢旺达","马达加斯加",
         "马拉维","马里","毛里求斯","毛里塔尼亚","摩洛哥",
         "莫桑比克","纳米比亚","南非","南苏丹","尼日尔","尼日利亚"
         "塞拉利昂","塞内加尔","塞舌尔","圣多美和普林西比",
         "斯威士兰","苏丹","索马里","坦桑尼亚","突尼斯",
         "乌干达","赞比亚","乍得","中非")

#按行读取


for index,row in df2.iterrows():
    
    print(row['国籍'],row['名称'],type(row['国籍']),type(row['名称']))
    
    if row['国籍'] in asia:
        df2['所在区域'][index] = '亚洲'
        
        if row['国籍'] == '沙巴':
            df2['国籍'][index] = '马来西亚'
        
    elif row['国籍'] in ocean:
        df2['所在区域'][index] = '大洋洲'
        
    elif row['国籍'] in africa:
        df2['所在区域'][index] = '非洲'
        
    elif row['国籍'] in europe:
        df2['所在区域'][index] = '欧洲'
        
    elif row['国籍'] in america:
        df2['所在区域'][index] = '美洲'
        
    else:
        df2['所在区域'][index] = '未知'      
    
         

print(df2[1:5])
df2.to_csv(r"feizhu3.csv",mode = 'a',index =False)
#'''