python对数据集进行清洗与可视化-专注的阿熊-ChinaUnix博客

专注的阿熊的ChinaUnix博客

首页　| 　博文目录　| 　关于我

专注的阿熊

博客访问： 3701013
博文数量： 365
博客积分： 0
博客等级：民兵
技术积分： 2522
用户组：普通用户
注册时间： 2019-10-28 13:40

文章分类

全部博文（365）

未分配的博文（365）

文章存档

2023年（8）

2022年（130）

2021年（155）

2020年（50）

2019年（22）

我的朋友

相关博文

python对数据集进行清洗与可视化

分类： Python/Ruby

2021-03-25 17:12:21

import os

import easygui as g

import glob

import pandas as pd

import xml.etree.ElementTree as ET

from tqdm import tqdm

import pandas_profiling

# Ask the user for the image folder and the annotation (xml) folder.
# The `default` arguments are only the dialog's starting location; point them
# at your own data to save some clicking.
image_path = g.diropenbox(title="请选择图像文件夹路径", default=r"E:\python\标定数据清洗\00001001-00001500image")
# diropenbox returns None when the dialog is cancelled. os.listdir(None) would
# list the current working directory, and later steps delete files based on
# these listings, so stop instead of continuing with the wrong folder.
if image_path is None:
    raise SystemExit("未选择图像文件夹，程序退出")
print(image_path)

xml_path = g.diropenbox(title="请选择xml文件夹路径", default=r"E:\python\标定数据清洗\00001001-00001500xml")
if xml_path is None:
    raise SystemExit("未选择xml文件夹，程序退出")
print(xml_path)

# Plain file names (no directory part) found in each folder.
image_lst = os.listdir(image_path)
xml_lst = os.listdir(xml_path)

print("image list:", len(image_lst))
print("xml list:", len(xml_lst))

print("————————功能1：显示命名不规划的xml文件———————————————————")

# Feature 1: report annotation files whose name does not follow the naming
# rule. The rule used here is "file name is exactly 12 characters long";
# replace it with your own convention if needed.
err_xml = []
for xml in xml_lst:
    if len(xml) == 12:
        continue  # name has the expected length, nothing to report
    print(xml)
    err_xml.append(xml)

if not err_xml:
    print("无不规范命名的xml文件")

print("————————功能2：缺失xml文件显示——————————————————————————")

# Feature 2: list the images that have no annotation file.
# `missing_xml` holds the file stems (name without extension) of those images.
# The set is built once so every membership test is O(1) instead of scanning
# the whole xml list for each image.
xml_names = set(xml_lst)
missing_xml = []
for image in tqdm(image_lst):
    # splitext copes with extensions of any length (".jpg", ".jpeg", ...),
    # unlike slicing off a fixed four characters.
    stem = os.path.splitext(image)[0]
    xml = stem + '.xml'
    if xml not in xml_names:
        missing_xml.append(stem)

print("缺失xml文件数：", len(missing_xml))
print("缺失xml文件为：", missing_xml)

print("————————功能3：缺失图像显示—————————————————————————————")

# Feature 3: list the annotation files that have no image (i.e. surplus xml).
# `missing_image` holds the file stems of those annotation files.
# Stems are compared instead of looking for "<stem>.jpg", so an annotation
# whose image is stored as .png/.jpeg/.JPG is not reported as orphaned (and
# therefore not deleted by the clean-up step that consumes this list).
image_stems = {os.path.splitext(name)[0] for name in image_lst}
missing_image = []
for xml in tqdm(xml_lst):
    stem = os.path.splitext(xml)[0]
    if stem not in image_stems:
        missing_image.append(stem)

print("缺失image文件数：", len(missing_image))
print("缺失image文件为：", missing_image)

print("————————功能4：删除没有对应xml的图片—————————————————————")

# Feature 4: delete every image listed in `missing_xml` (images without an
# annotation file). `drop_list1` collects the stems that were deleted and
# `missing_xml` is left empty afterwards.
# Map each stem to the real file name so images that are not ".jpg" are found;
# unknown stems fall back to "<stem>.jpg".
image_by_stem = {os.path.splitext(name)[0]: name for name in image_lst}
drop_list1 = []
# Iterate over a snapshot: removing entries from a list while looping over the
# same list skips elements.
for index1 in list(missing_xml):
    image = image_by_stem.get(index1, index1 + '.jpg')
    # os.path.join uses the separator of the current platform.
    # An OSError from os.remove (e.g. file already gone) is not caught here.
    os.remove(os.path.join(image_path, image))
    drop_list1.append(index1)
missing_xml.clear()

if drop_list1:
    print("成功删除：", drop_list1)
else:
    print("无缺失文件")

print("————————功能5：删除没有对应图片的xml文件——————————————————")

# Feature 5: delete every annotation file listed in `missing_image`
# (annotations without an image). `drop_list2` collects the stems that were
# deleted and `missing_image` is left empty afterwards.
drop_list2 = []
# Iterate over a snapshot: removing entries from a list while looping over the
# same list skips elements.
for index2 in list(missing_image):
    xml = index2 + '.xml'
    # os.path.join uses the separator of the current platform.
    # An OSError from os.remove (e.g. file already gone) is not caught here.
    os.remove(os.path.join(xml_path, xml))
    drop_list2.append(index2)
missing_image.clear()

if drop_list2:
    print("成功删除：", drop_list2)
else:
    print("无缺失文件")

print("————————功能6：将xml文件写入csv文件——————————————————————")


def xml_to_csv(path):
    """Collect the bounding boxes of all annotation files in *path*.

    Every ``<object>`` element of every ``*.xml`` file directly inside *path*
    becomes one row, which makes the labels easy to analyse later on.

    The elements are read by position: the first two children of ``<size>``
    are taken as width and height, the first child of ``<object>`` as the
    class name and the first four children of the fifth child of ``<object>``
    as xmin, ymin, xmax, ymax. (Presumably the Pascal VOC layout written by
    common labelling tools -- verify against your own files.)

    Args:
        path: Directory that contains the annotation files.

    Returns:
        pandas.DataFrame with the columns ``filename``, ``width``, ``height``,
        ``class``, ``xmin``, ``ymin``, ``xmax``, ``ymax``. It has no rows when
        no object was found.

    Raises:
        xml.etree.ElementTree.ParseError: propagated from ``ET.parse`` when a
            file is not well-formed XML.
    """
    xml_list = []
    # os.path.join picks the right separator for the platform; glob.escape
    # keeps characters such as "[" in the directory name from being treated
    # as wildcards.
    pattern = os.path.join(glob.escape(path), '*.xml')
    for xml_file in glob.glob(pattern):
        root = ET.parse(xml_file).getroot()
        for member in root.findall('object'):
            size = root.find('size')
            bndbox = member[4]
            xml_list.append((
                root.find('filename').text,
                int(size[0].text),       # width
                int(size[1].text),       # height
                member[0].text,          # class name
                int(bndbox[0].text),     # xmin
                int(bndbox[1].text),     # ymin
                int(bndbox[2].text),     # xmax
                int(bndbox[3].text),     # ymax
            ))
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    xml_df = pd.DataFrame(xml_list, columns=column_name)
    return xml_df

# Export the collected annotations to labels.csv in the current working
# directory; index=False leaves the DataFrame's row index out of the file.
xml_df = xml_to_csv(xml_path)
xml_df.to_csv('labels.csv', index=False)
# The stray "货币符号" text that had crept into this message was removed.
print('Successfully converted xml to csv.')

print("————————功能7：查看xml文件信息，生成报告———————————————————")


def eda(in_file, out_file):
    """Build a pandas_profiling report for a CSV file and save it.

    Args:
        in_file: Path of the comma-separated file to read.
        out_file: Path the generated report is written to.
    """
    frame = pd.read_csv(in_file, sep=',')
    report = pandas_profiling.ProfileReport(frame)
    report.to_file(out_file)

# Profile the exported label table and write the report as an HTML file
# into the current working directory.
in_file, out_file = 'labels.csv', 'labels.html'
eda(in_file=in_file, out_file=out_file)
print('eda done!')

print("————————功能8：改写label出错的xml文件————————————————————")


def main(path):
    """Correct known label typos in the annotation files of a directory.

    Every ``*.xml`` file directly inside *path* is parsed and the text of the
    first child of each ``<object>`` element (the label) is checked:

    * ``chemical_vehical`` / ``chemcial_vehicle`` / ``chemical_vehicel`` are
      rewritten to ``chemical_vehicle``;
    * ``chemical_sigh`` is rewritten to ``chemical_sign``;
    * ``w`` is only reported, not changed.

    A file is written back only when at least one of its labels was
    corrected. The file names (taken from each file's ``<filename>`` element)
    of all affected annotations are printed at the end.

    Args:
        path: Directory that contains the annotation files.

    Raises:
        xml.etree.ElementTree.ParseError: propagated from ``ET.parse`` when a
            file is not well-formed XML.
    """
    vehicle_typos = ('chemical_vehical', 'chemcial_vehicle', 'chemical_vehicel')
    wrong_class_lst1, wrong_class_lst2, w_lst = [], [], []

    # The directory and the wildcard must be joined with a path separator,
    # otherwise the pattern never matches the files inside the directory.
    # glob.escape keeps characters such as "[" in the directory name literal.
    pattern = os.path.join(glob.escape(path), '*.xml')
    for xml_file in glob.glob(pattern):
        print(xml_file)
        tree = ET.parse(xml_file)
        root = tree.getroot()
        changed = False
        for member in root.findall('object'):
            value = member[0].text
            if value in vehicle_typos:
                wrong_class_lst1.append(root.find('filename').text)
                member[0].text = 'chemical_vehicle'
                changed = True
            elif value == 'chemical_sigh':
                wrong_class_lst2.append(root.find('filename').text)
                member[0].text = 'chemical_sign'
                changed = True
            elif value == 'w':
                w_lst.append(root.find('filename').text)
        if changed:
            tree.write(xml_file)

    print('wrong_class_list1:', wrong_class_lst1)
    print('wrong_class_list2:', wrong_class_lst2)
    print('w_list:', w_lst)

# Run the label correction on the selected annotation folder, then report
# that the whole script has finished.
main(path=xml_path)
print("完成！")

阅读(17604) | 评论(0) | 转发(0) |

上一篇：SkipList跳表

下一篇：感知机原理解析与代码实现

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6