COCO Dataset Format

COCO (Common Objects in Context) dataset数据集是一个广泛应用于目标检测、语义分割的数据集，包含330K 图片数据与 2.5 million 个目标实体。

1.数据集下载

!wget http://images.cocodataset.org/zips/train2017.zip -O coco_train2017.zip
!wget http://images.cocodataset.org/zips/val2017.zip -O coco_val2017.zip
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip -O coco_ann2017.zip

2.数据集解压缩

from zipfile import ZipFile, BadZipFile
import os

def extract_zip_file(extract_path):
    try:
        with ZipFile(extract_path+".zip") as zfile:
            zfile.extractall(extract_path)

        # remove zipfile
        zfileTOremove=f"{extract_path}"+".zip"
        if os.path.isfile(zfileTOremove):
            os.remove(zfileTOremove)
        else:
            print("Error: %s file not found" % zfileTOremove)    

    except BadZipFile as e:
        print("Error:", e)


extract_train_path = "./coco_train2017"
extract_val_path = "./coco_val2017"
extract_ann_path="./coco_ann2017"

extract_zip_file(extract_train_path)
extract_zip_file(extract_val_path)
extract_zip_file(extract_ann_path)

解压缩后，coco_train2017 与 coco_val2017文件夹下包含子文件夹train2017与val2017，各自包含有图片数据.

而coco_ann2017 子文件夹下有6个JSON 格式annotation 文件位于子文件夹annotations.

3.数据集格式

随意调整一个format格式文件，说明如下：

{   
    "info": 
    { "description": "COCO 2017 Dataset",
        "url": "http://cocodataset.org",
        "version": "1.0",
        "year": 2017,
        "contributor": "COCO Consortium",
        "date_created": "2017/09/01"
    },
    "licenses": 
    [
        {"url": "http://creativecommons.org/licenses/by-nc-sa/2.0/","id": 1,"name": "Attribution-NonCommercial-ShareAlike License"},
        {"url": "http://creativecommons.org/licenses/by-nc/2.0/","id": 2,"name": "Attribution-NonCommercial License"},
        ...,
        ...
    ],
        
    "images":     
    [
        {"license": 4,"file_name": "000000397133.jpg","coco_url": "http://images.cocodataset.org/val2017/000000397133.jpg","height": 427,"width": 640,"date_captured": "2013-11-14 17:02:52","flickr_url": "http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg","id": 397133},
        {"license": 1,"file_name": "000000037777.jpg","coco_url": "http://images.cocodataset.org/val2017/000000037777.jpg","height": 230,"width": 352,"date_captured": "2013-11-14 20:55:31","flickr_url": "http://farm9.staticflickr.com/8429/7839199426_f6d48aa585_z.jpg","id": 37777},
        ...,
        ...
    ],
    "annotations": 
    [
        {"segmentation": [[510.66,423.01,511.72,...,...]],"area": 702.1057499999998,"iscrowd": 0,"image_id": 289343,"bbox": [473.07,395.93,38.65,28.67],"category_id": 18,"id": 1768},
        {"segmentation": [[289.74,443.39,302.29,...,...]],"area": 27718.476299999995,"iscrowd": 0,"image_id": 61471,"bbox": [272.1,200.23,151.97,279.77],"category_id": 18,"id": 1773}
        ...,
        ...
    ],
    "categories": 
    [
        {"supercategory": "person","id": 1,"name": "person"},
        {"supercategory": "vehicle","id": 2,"name": "bicycle"},
        ...,
        ...
    ]
}

info：

The “info” component provides metadata about the COCO dataset, including the version number, the date it was created, and the contact information for the creators of the dataset.

"info": 
{ "description": "COCO 2017 Dataset",
    "url": "http://cocodataset.org",
    "version": "1.0",
    "year": 2017,
    "contributor": "COCO Consortium",
    "date_created": "2017/09/01"
}

licenses

"license": an integer value indicating the license type of the image. This value corresponds to the license "id" in the "licenses" component.
"file_name": a string containing the name of the image file.
"coco_url": a string containing a URL to the image on the COCO website.
"height": an integer value representing the height of the image in pixels.
"width": an integer value representing the width of the image in pixels.
"date_captured": a string representing the date and time that the image was captured.
"flickr_url": a string containing a URL to the image on Flickr.
"id": a unique identifier for the image, as an integer value.

"licenses": 
[
    {
        "url": "http://creativecommons.org/licenses/by-nc-sa/2.0/",
        "id": 1,
        "name": "Attribution-NonCommercial-ShareAlike License"
    },
    ...,
    ...,
    ...
]

images

"license": an integer value indicating the license type of the image. This value corresponds to the license "id" in the "licenses" component.
"file_name": a string containing the name of the image file.
"coco_url": a string containing a URL to the image on the COCO website.
"height": an integer value representing the height of the image in pixels.
"width": an integer value representing the width of the image in pixels.
"date_captured": a string representing the date and time that the image was captured.
"flickr_url": a string containing a URL to the image on Flickr.
"id": a unique identifier for the image, as an integer value.

"images":     
[
    {
        "license": 4,
        "file_name": "000000397133.jpg",
        "coco_url": "http://images.cocodataset.org/val2017/000000397133.jpg",
        "height": 427,
        "width": 640,
        "date_captured": "2013-11-14 17:02:52",
        "flickr_url": "http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg",
        "id": 397133
    },
    ...,
    ...,
    ...       
]

annotations

"segmentation": The “segmentation” key in an annotation dictionary holds a list of floating point numbers that represent the pixel coordinates of an object’s segmentation mask. They can be used to plot the segmentation mask of the object on an image. To plot the mask, we need to take pairs of numbers (the first and second value, then the third and fourth, etc.) and use them as the x and y coordinates of the pixels.
"area": a floating point value indicating the area of the object in segmentation mask in pixels squared.
"iscrowd": a binary integer value indicating whether the object is part of a crowd (1) or not (0).
"image_id": an integer value that is a unique identifier for the image in which the object appears. This "image_id" corresponds to the "id" in "image" component.
"bbox": a list of four floating point values representing the bounding box of the object in the format [x, y, width, height].
"category_id": an integer value indicating the category or class of the object.
"id": a unique identifier for the annotation across the entire COCO dataset, as an integer value.

"annotations": 
[
    {
        "segmentation": [[510.66,423.01,511.72,...,...,...]],
        "area": 702.1057499999998,
        "iscrowd": 0,
        "image_id": 289343,
        "bbox": [473.07,395.93,38.65,28.67],
        "category_id": 18,
        "id": 1768
    },
    ...,
    ...,
    ...
]

4.数据集解析

COCO dataset 以JSON 文件格式存储annotations,借助下属class可完成解析。

from collections import defaultdict
import json
import numpy as np
class COCOParser:
    def __init__(self, anns_file, imgs_dir):
        with open(anns_file, 'r') as f:
            coco = json.load(f)
            
        self.annIm_dict = defaultdict(list)        
        self.cat_dict = {} 
        self.annId_dict = {}
        self.im_dict = {}
        self.licenses_dict = {}
        for ann in coco['annotations']:           
            self.annIm_dict[ann['image_id']].append(ann) 
            self.annId_dict[ann['id']]=ann
        for img in coco['images']:
            self.im_dict[img['id']] = img
        for cat in coco['categories']:
            self.cat_dict[cat['id']] = cat
        for license in coco['licenses']:
            self.licenses_dict[license['id']] = license
    def get_imgIds(self):
        return list(self.im_dict.keys())
    def get_annIds(self, im_ids):
        im_ids=im_ids if isinstance(im_ids, list) else [im_ids]
        return [ann['id'] for im_id in im_ids for ann in self.annIm_dict[im_id]]
    def load_anns(self, ann_ids):
        im_ids=ann_ids if isinstance(ann_ids, list) else [ann_ids]
        return [self.annId_dict[ann_id] for ann_id in ann_ids]        
    def load_cats(self, class_ids):
        class_ids=class_ids if isinstance(class_ids, list) else [class_ids]
        return [self.cat_dict[class_id] for class_id in class_ids]
    def get_imgLicenses(self,im_ids):
        im_ids=im_ids if isinstance(im_ids, list) else [im_ids]
        lic_ids = [self.im_dict[im_id]["license"] for im_id in im_ids]
        return [self.licenses_dict[lic_id] for lic_id in lic_ids]

if __name__ == "__main__"
    coco_annotations_file="/content/coco_ann2017/annotations/instances_val2017.json"
    coco_images_dir="/content/coco_val2017/val2017"
    coco= COCOParser(coco_annotations_file, coco_images_dir)

5.数据可视化

import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
# define a list of colors for drawing bounding boxes
color_list = ["pink", "red", "teal", "blue", "orange", "yellow", "black", "magenta","green","aqua"]*10
num_imgs_to_disp = 4
total_images = len(coco.get_imgIds()) # total number of images
sel_im_idxs = np.random.permutation(total_images)[:num_imgs_to_disp]
img_ids = coco.get_imgIds()
selected_img_ids = [img_ids[i] for i in sel_im_idxs]
ann_ids = coco.get_annIds(selected_img_ids)
im_licenses = coco.get_imgLicenses(selected_img_ids)
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15,10))
ax = ax.ravel()
for i, im in enumerate(selected_img_ids):
    image = Image.open(f"{coco_images_dir}/{str(im).zfill(12)}.jpg")
    ann_ids = coco.get_annIds(im)
    annotations = coco.load_anns(ann_ids)
    for ann in annotations:
        bbox = ann['bbox']
        x, y, w, h = [int(b) for b in bbox]
        class_id = ann["category_id"]
        class_name = coco.load_cats(class_id)[0]["name"]
        license = coco.get_imgLicenses(im)[0]["name"]
        color_ = color_list[class_id]
        rect = plt.Rectangle((x, y), w, h, linewidth=2, edgecolor=color_, facecolor='none')
        t_box=ax[i].text(x, y, class_name,  color='red', fontsize=10)
        t_box.set_bbox(dict(boxstyle='square, pad=0',facecolor='white', alpha=0.6, edgecolor='blue'))
        ax[i].add_patch(rect)
    
    ax[i].axis('off')
    ax[i].imshow(image)
    ax[i].set_xlabel('Longitude')
    ax[i].set_title(f"License: {license}")
plt.tight_layout()
plt.show()