利用Pandas清洗csv数据
来自CloudWiki
读取文件
import pandas as pd import numpy as np import csv df = pd.read_csv(r'feizhu2.csv',low_memory=False)
获取列索引值
#获取列索引值 print(df.columns)#获取列索引值
插入新列
#插入新列 data1 = df2['国籍']#获取列名为flow的数据作为新列的数据 df2['所在区域'] = data1 df2.to_csv(r"test2.csv",mode = 'a',index =False)
数据筛选
筛选特定网址的数据行,
筛选特定价格的数据行:
#筛选 #print(df.loc[1:4,['国家','名称']]) df1 = df[(df['商品链接'].str.contains("https://traveldetail.fliggy.com",case=False, na=False))] df2 = df1[((df1['价格'].astype('int'))>=50)] #https://www.cnblogs.com/xiaobingqianrui/p/9996177.html
数据遍历和修改数据
asia = ("中国","朝鲜","韩国","蒙古","日本","越南","老挝","柬埔寨", "缅甸","泰国","马来西亚","新加坡","印度尼西亚","菲律宾", "文莱","东帝汶","斯里兰卡","马尔代夫","巴基斯坦","印度", "孟加拉国","尼泊尔","不丹","伊朗","土耳其","塞浦路斯", "叙利亚","黎巴嫩","巴勒斯坦","以色列","约旦","伊拉克", "科威特","沙特阿拉伯","也门","阿曼","阿拉伯联合酋长国", "卡塔尔","巴林","格鲁吉亚","亚美尼亚","阿塞拜疆","阿拉伯","长滩岛") for index,row in df2.iterrows(): print(row['国籍'],row['名称'],type(row['国籍']),type(row['名称'])) if row['国籍'] in asia: df2['所在区域'][index] = '亚洲'
完整代码:
import pandas as pd import numpy as np import csv df = pd.read_csv(r'feizhu2.csv',low_memory=False) #获取列索引值 print(df.columns)#获取列索引值 #筛选 #print(df.loc[1:4,['国家','名称']]) df1 = df[(df['商品链接'].str.contains("https://traveldetail.fliggy.com",case=False, na=False))] df2 = df1[((df1['价格'].astype('int'))>=50)] #https://www.cnblogs.com/xiaobingqianrui/p/9996177.html # #插入新列 data1 = df2['国籍']#获取列名为flow的数据作为新列的数据 df2['所在区域'] = data1 df2.to_csv(r"test2.csv",mode = 'a',index =False) #''' asia = ("中国","朝鲜","韩国","蒙古","日本","越南","老挝","柬埔寨", "缅甸","泰国","马来西亚","新加坡","印度尼西亚","菲律宾", "文莱","东帝汶","斯里兰卡","马尔代夫","巴基斯坦","印度", "孟加拉国","尼泊尔","不丹","伊朗","土耳其","塞浦路斯", "叙利亚","黎巴嫩","巴勒斯坦","以色列","约旦","伊拉克", "科威特","沙特阿拉伯","也门","阿曼","阿拉伯联合酋长国", "卡塔尔","巴林","格鲁吉亚","亚美尼亚","阿塞拜疆","阿拉伯","长滩岛") europe = ("奥地利","比利时","保加利亚","塞浦路斯","克罗地亚", "捷克共和国","丹麦","爱沙尼亚","芬兰","法国","德国", "希腊","匈牙利","爱尔兰","意大利","拉脱维亚","立陶宛", "卢森堡","马耳他","荷兰","波兰","葡萄牙","罗马尼亚", "斯洛伐克","斯洛文尼亚","西班牙","瑞典","英国") america = ("阿根廷","巴哈马","伯利兹","美国","玻利维亚","巴西", "巴巴多斯","加拿大","哥伦比亚","智利","哥斯达黎加", "古巴","委内瑞拉","萨尔瓦多","厄瓜多尔","格林纳达", "危地马拉","圭亚那","洪都拉斯","海地","牙买加", "圣卢西亚","墨西哥","尼加拉瓜","巴拿马","秘鲁", "乌拉圭","巴拉圭","苏里南","多米尼加","多米尼克", "圣文森特和格林纳丁斯","特立尼达和多巴哥", "安提瓜和巴布","达圣基茨和尼维斯") us_city = ('塞班','纽约','洛杉矶','华盛顿','旧金山','夏威夷','芝加哥','波士顿', '西雅图','迈阿密','休斯顿','拉斯维加斯','阿拉斯加','波特兰') ocean =("澳大利亚","新西兰","巴布亚新几内亚","所罗门", "瓦努阿图","帕劳","瑙鲁","图瓦卢","基里巴斯","萨摩亚", "汤加","密克罗尼西亚","斐济","密克罗尼亚 ","马绍尔") africa= ("阿尔及利亚","埃及","埃塞俄比亚","安哥拉","贝宁", "博茨瓦纳","布基纳法索","布隆迪","赤道几内亚", "多哥","厄立特里亚","佛得角","冈比亚","刚果", "吉布提","几内亚","几内亚比绍","加纳","加蓬", "津巴布韦","喀麦隆","科摩罗","科特迪瓦","肯尼亚", "莱索托","利比里亚","利比亚","卢旺达","马达加斯加", "马拉维","马里","毛里求斯","毛里塔尼亚","摩洛哥", "莫桑比克","纳米比亚","南非","南苏丹","尼日尔","尼日利亚" "塞拉利昂","塞内加尔","塞舌尔","圣多美和普林西比", "斯威士兰","苏丹","索马里","坦桑尼亚","突尼斯", "乌干达","赞比亚","乍得","中非") #按行读取 for index,row in df2.iterrows(): print(row['国籍'],row['名称'],type(row['国籍']),type(row['名称'])) if row['国籍'] in asia: df2['所在区域'][index] = '亚洲' if row['国籍'] == '沙巴': df2['国籍'][index] = '马来西亚' elif row['国籍'] in ocean: df2['所在区域'][index] = '大洋洲' elif row['国籍'] in africa: df2['所在区域'][index] = '非洲' elif row['国籍'] in europe: df2['所在区域'][index] = '欧洲' elif row['国籍'] in america: df2['所在区域'][index] = '美洲' else: df2['所在区域'][index] = '未知' print(df2[1:5]) df2.to_csv(r"feizhu3.csv",mode = 'a',index =False) #'''