當前位置：首頁 > 编程资源 > 编程问答 > 内容正文

编程问答

yolo标注的数据清洗

發布時間：2025/3/20 编程问答 21 豆豆

生活随笔收集整理的這篇文章主要介紹了 yolo标注的数据清洗，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

文章目錄

原
- 代碼
20200708 無需讀取圖片分辨率直接指定

原

在做yolo標注時，有時我們需要剔除標注中的一些錯誤的標注或者超過一定范圍的標注，

比如我們希望將中心點靠近圖像邊緣Δ距離的標注框給剔除，可以使用以下方法：

代碼

# -*- coding: utf-8 -*-
"""Clean YOLO annotation files by dropping implausible bounding boxes.

@File    : yolo_annotation_clean.py
@Time    : 2020/5/13 15:29
@Author  : Dontla
@Email   : sxana@qq.com
@Software: PyCharm

Every ``.txt`` label file in ``./source_txt/`` is read, each
``class x y w h`` record is scaled to pixels using the resolution of the
matching ``.jpg`` in ``./source_img/``, rejected boxes are reported on stdout,
and the surviving boxes are written to a file of the same name in
``./target_txt/``.
"""
import os
import random  # not used below; retained from the original script
import re
import shutil  # not used below; retained from the original script

try:
    import cv2
except ImportError:  # lets the text-only helpers be imported without OpenCV
    cv2 = None

# Only this class id is accepted; any other id is reported as a class error.
VALID_CLASS_ID = 0
# A box is rejected when one side is more than this many times the other.
MAX_ASPECT_RATIO = 2
# Accepted side length of a box in pixels (both bounds inclusive).
MIN_SIDE_PX = 80
MAX_SIDE_PX = 300

# One non-negative decimal number, e.g. "0", "12" or "0.631250".
_NUMBER = r'(\d+\.?\d*)'
# Five numbers separated by single spaces: class id, x, y, w, h.
_BOX_PATTERN = re.compile(' '.join([_NUMBER] * 5))


def _sort_token(text):
    """Return a sort token for ``text``: numbers (by value) before other text.

    The token is a tuple so that numeric and non-numeric names can be compared
    with each other without raising ``TypeError``.
    """
    if re.fullmatch(r'\d+', text):
        return (0, int(text), '')
    if re.fullmatch(r'\d+\.\d+', text):
        return (0, float(text), '')
    return (1, 0, text)


def sort_filenames(filenames, pattern):
    """Sort ``filenames`` in place by what ``pattern`` captures in each name.

    Captured pieces that look like numbers are compared by value, so
    ``2.txt`` sorts before ``10.txt``. Pieces that are not plain numbers
    (``classes``) sort after all numeric ones, by text. Nothing is evaluated
    as Python code.

    Args:
        filenames: list of file name strings; modified in place.
        pattern: regular expression (string) whose groups select the part of
            the name to sort by.
    """
    def natural_key(name):
        tokens = []
        for found in re.findall(pattern, name):
            # findall yields tuples when the pattern has several groups.
            groups = found if isinstance(found, tuple) else (found,)
            tokens.extend(_sort_token(group) for group in groups)
        return tokens

    filenames.sort(key=natural_key)


def extract_content(content_):
    """Return every ``class x y w h`` record found in ``content_``.

    Args:
        content_: full text of a label file.

    Returns:
        A list of 5-tuples of strings, one tuple per record, in file order.
        The pattern has no trailing newline so that a file whose last (or
        only) line is not newline-terminated is still picked up.
    """
    return _BOX_PATTERN.findall(content_)


def _parse_number(text):
    """Convert a captured token to ``int`` (no dot) or ``float`` (with dot)."""
    return float(text) if '.' in text else int(text)


def _format_corners(class_id, x, y, w, h):
    """Format a box as ``[left, top, right, bottom, class_id]`` in pixels."""
    return '[{}, {}, {}, {}, {}]'.format(
        round(x - w / 2), round(y - h / 2),
        round(x + w / 2), round(y + h / 2), class_id)


def _find_defect(class_id, x, y, w, h, img_width, img_height):
    """Return the report heading for the first failed check, or ``None``.

    ``x, y`` is the box centre and ``w, h`` its size, all in pixels. The
    checks run in a fixed order and the first failing one decides the heading.
    """
    if class_id != VALID_CLASS_ID:
        return '【類標注錯誤】：'

    if x < 0 or x >= img_width or y < 0 or y >= img_height:
        return '【標注框中心點超出圖片范圍】：'

    left, right = round(x - w / 2), round(x + w / 2)
    top, bottom = round(y - h / 2), round(y + h / 2)
    if (left < 0 or right > img_width or left >= right
            or top < 0 or bottom > img_height or top >= bottom):
        return '【標注框頂點超出范圍】：'

    # w and h are non-zero here: a zero-sized side makes two rounded corners
    # coincide, which the check above already rejected.
    if w / h > MAX_ASPECT_RATIO or h / w > MAX_ASPECT_RATIO:
        return '【框的長寬比不合適】：'

    if w < MIN_SIDE_PX or w > MAX_SIDE_PX or h < MIN_SIDE_PX or h > MAX_SIDE_PX:
        return '【標注框大小有問題】：'

    return None


def _clean_boxes(records, filename, img_width, img_height):
    """Report rejected boxes and return the text of the boxes that are kept.

    Args:
        records: 5-tuples of strings as returned by ``extract_content``.
        filename: label file name, printed with every rejected box.
        img_width: image width in pixels used to scale x and w.
        img_height: image height in pixels used to scale y and h.

    Returns:
        The kept records, one ``class x y w h`` line each, joined by newlines
        (an empty string when every box was rejected).
    """
    kept_lines = []
    for record in records:
        box = [_parse_number(token) for token in record]
        class_id = box[0]
        x, y = box[1] * img_width, box[2] * img_height
        w, h = box[3] * img_width, box[4] * img_height

        heading = _find_defect(class_id, x, y, w, h, img_width, img_height)
        if heading is not None:
            print(heading)
            print(filename)
            print(box)
            print(_format_corners(class_id, x, y, w, h))
            print('\n')
            continue

        kept_lines.append('{} {} {} {} {}'.format(*box))
    return '\n'.join(kept_lines)


def _read_image_size(image_path):
    """Return ``(width, height)`` of the image, or ``None`` if it is unreadable.

    Raises:
        ImportError: if OpenCV is not installed.
    """
    if cv2 is None:
        raise ImportError('opencv-python (cv2) is required to read image sizes')
    img = cv2.imread(image_path)
    if img is None:  # imread signals a missing/unreadable file with None
        return None
    return img.shape[1], img.shape[0]


def main():
    """Clean every label file of ``./source_txt/`` into ``./target_txt/``."""
    # All three paths are relative to the current working directory.
    source_img_path = './source_img/'
    source_txt_path = './source_txt/'
    target_txt_path = './target_txt/'

    os.makedirs(target_txt_path, exist_ok=True)

    # os.listdir is not recursive and also returns sub-directories and
    # non-label files, so keep only the .txt entries.
    filenames = [name for name in os.listdir(source_txt_path)
                 if name.endswith('.txt')]
    # Without this the order would be 1, 10, 100, ... instead of 1, 2, 3, ...
    sort_filenames(filenames, r'(.*?)\.txt')

    for filename in filenames:
        with open(os.path.join(source_txt_path, filename), 'r',
                  encoding='utf-8') as f:
            content = f.read()
        records = extract_content(content)

        # The resolution is read per image so mixed image sizes work.
        image_path = os.path.join(
            source_img_path, os.path.splitext(filename)[0] + '.jpg')
        size = _read_image_size(image_path)
        if size is None:
            print('[skip] {}: cannot read image {}'.format(filename, image_path))
            continue
        img_width, img_height = size

        cleaned = _clean_boxes(records, filename, img_width, img_height)

        # A file whose boxes were all rejected is still written, empty.
        with open(os.path.join(target_txt_path, filename), 'w',
                  encoding='utf-8') as f2:
            f2.write(cleaned)


if __name__ == '__main__':
    main()

20200708 無需讀取圖片分辨率直接指定

# -*- coding: utf-8 -*-
"""Clean YOLO annotation files using a fixed image resolution.

@File    : yolo_annotation_clean.py
@Time    : 2020/5/13 15:29
@Author  : Dontla
@Email   : sxana@qq.com
@Software: PyCharm

Every ``.txt`` label file in ``./source_txt/`` is read, each
``class x y w h`` record is scaled to pixels with a hard-coded 1280x720
resolution (no image is opened), rejected boxes are reported on stdout and
counted, and the surviving boxes are written to a file of the same name in
``./target_txt/``.
"""
import os
import random  # not used below; retained from the original script
import re
import shutil  # not used below; retained from the original script

try:
    # Not used by this fixed-resolution variant; retained from the original
    # script but guarded so a missing OpenCV install does not stop the run.
    import cv2
except ImportError:
    cv2 = None

# Resolution every label file is assumed to refer to, in pixels.
IMG_WIDTH = 1280
IMG_HEIGHT = 720

# Only this class id is accepted; any other id is reported as a class error.
VALID_CLASS_ID = 0
# A box is rejected when one side is more than this many times the other.
MAX_ASPECT_RATIO = 2
# Accepted side length of a box in pixels (both bounds inclusive).
MIN_SIDE_PX = 100
MAX_SIDE_PX = 300

# One non-negative decimal number, e.g. "0", "12" or "0.631250".
_NUMBER = r'(\d+\.?\d*)'
# Five numbers separated by single spaces: class id, x, y, w, h.
_BOX_PATTERN = re.compile(' '.join([_NUMBER] * 5))


def _sort_token(text):
    """Return a sort token for ``text``: numbers (by value) before other text.

    The token is a tuple so that numeric and non-numeric names can be compared
    with each other without raising ``TypeError``.
    """
    if re.fullmatch(r'\d+', text):
        return (0, int(text), '')
    if re.fullmatch(r'\d+\.\d+', text):
        return (0, float(text), '')
    return (1, 0, text)


def sort_filenames(filenames, pattern):
    """Sort ``filenames`` in place by what ``pattern`` captures in each name.

    Captured pieces that look like numbers are compared by value, so
    ``2.txt`` sorts before ``10.txt``. Pieces that are not plain numbers
    (``classes``) sort after all numeric ones, by text. Nothing is evaluated
    as Python code.

    Args:
        filenames: list of file name strings; modified in place.
        pattern: regular expression (string) whose groups select the part of
            the name to sort by.
    """
    def natural_key(name):
        tokens = []
        for found in re.findall(pattern, name):
            # findall yields tuples when the pattern has several groups.
            groups = found if isinstance(found, tuple) else (found,)
            tokens.extend(_sort_token(group) for group in groups)
        return tokens

    filenames.sort(key=natural_key)


def extract_content(content_):
    """Return every ``class x y w h`` record found in ``content_``.

    Args:
        content_: full text of a label file.

    Returns:
        A list of 5-tuples of strings, one tuple per record, in file order.
        The pattern has no trailing newline so that a file whose last (or
        only) line is not newline-terminated is still picked up.
    """
    return _BOX_PATTERN.findall(content_)


def _parse_number(text):
    """Convert a captured token to ``int`` (no dot) or ``float`` (with dot)."""
    return float(text) if '.' in text else int(text)


def _format_corners(class_id, x, y, w, h):
    """Format a box as ``[left, top, right, bottom, class_id]`` in pixels."""
    return '[{}, {}, {}, {}, {}]'.format(
        round(x - w / 2), round(y - h / 2),
        round(x + w / 2), round(y + h / 2), class_id)


def _find_defect(class_id, x, y, w, h, img_width, img_height):
    """Return the report heading for the first failed check, or ``None``.

    ``x, y`` is the box centre and ``w, h`` its size, all in pixels. The
    checks run in a fixed order and the first failing one decides the heading.
    """
    if class_id != VALID_CLASS_ID:
        return '【類標注錯誤】：'

    if x < 0 or x >= img_width or y < 0 or y >= img_height:
        return '【標注框中心點超出圖片范圍】：'

    left, right = round(x - w / 2), round(x + w / 2)
    top, bottom = round(y - h / 2), round(y + h / 2)
    if (left < 0 or right > img_width or left >= right
            or top < 0 or bottom > img_height or top >= bottom):
        return '【標注框頂點超出范圍】：'

    # w and h are non-zero here: a zero-sized side makes two rounded corners
    # coincide, which the check above already rejected.
    if w / h > MAX_ASPECT_RATIO or h / w > MAX_ASPECT_RATIO:
        return '【框的長寬比不合適】：'

    if w < MIN_SIDE_PX or w > MAX_SIDE_PX or h < MIN_SIDE_PX or h > MAX_SIDE_PX:
        return '【標注框大小有問題】：'

    return None


def _clean_boxes(records, filename, img_width, img_height):
    """Report rejected boxes; return the kept text and the rejection count.

    Args:
        records: 5-tuples of strings as returned by ``extract_content``.
        filename: label file name, printed with every rejected box.
        img_width: image width in pixels used to scale x and w.
        img_height: image height in pixels used to scale y and h.

    Returns:
        ``(text, rejected)`` where ``text`` holds the kept records, one
        ``class x y w h`` line each, joined by newlines (empty when every box
        was rejected) and ``rejected`` is the number of boxes dropped.
    """
    kept_lines = []
    rejected = 0
    for record in records:
        box = [_parse_number(token) for token in record]
        class_id = box[0]
        x, y = box[1] * img_width, box[2] * img_height
        w, h = box[3] * img_width, box[4] * img_height

        heading = _find_defect(class_id, x, y, w, h, img_width, img_height)
        if heading is not None:
            rejected += 1
            print(heading)
            print(filename)
            print(box)
            print(_format_corners(class_id, x, y, w, h))
            print('\n')
            continue

        kept_lines.append('{} {} {} {} {}'.format(*box))
    return '\n'.join(kept_lines), rejected


def main():
    """Clean every label file of ``./source_txt/`` into ``./target_txt/``."""
    # Both paths are relative to the current working directory.
    source_txt_path = './source_txt/'
    target_txt_path = './target_txt/'

    os.makedirs(target_txt_path, exist_ok=True)

    # Running total of rejected boxes over all files.
    error_boxs_num = 0

    # os.listdir is not recursive and also returns sub-directories and
    # non-label files, so keep only the .txt entries.
    filenames = [name for name in os.listdir(source_txt_path)
                 if name.endswith('.txt')]
    # Without this the order would be 1, 10, 100, ... instead of 1, 2, 3, ...
    sort_filenames(filenames, r'(.*?)\.txt')

    for filename in filenames:
        with open(os.path.join(source_txt_path, filename), 'r',
                  encoding='utf-8') as f:
            content = f.read()

        cleaned, rejected = _clean_boxes(
            extract_content(content), filename, IMG_WIDTH, IMG_HEIGHT)
        error_boxs_num += rejected

        # A file whose boxes were all rejected is still written, empty.
        with open(os.path.join(target_txt_path, filename), 'w',
                  encoding='utf-8') as f2:
            f2.write(cleaned)

    print('錯誤標注框數量：{}'.format(error_boxs_num))


if __name__ == '__main__':
    main()

結果：

總結

以上是生活随笔為你收集整理的yolo标注的数据清洗的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

数据
Yolo