syntax = "proto2";

package waymo.open_dataset;

import "waymo_open_dataset/label.proto";
import "waymo_open_dataset/protos/map.proto";
import "waymo_open_dataset/protos/vector.proto";

message MatrixShape {
  // Dimensions for the Matrix messages defined below. Must not be empty.
  //
  // The order of entries in 'dims' matters, as it indicates the layout of the
  // values in the tensor in-memory representation.
  //
  // The first entry in 'dims' is the outermost dimension used to lay out the
  // values; the last entry is the innermost dimension. This matches the
  // in-memory layout of row-major matrices.
  repeated int32 dims = 1;
}

// Row-major matrix.
// Requires: data.size() = product(shape.dims()).
message MatrixFloat {
  repeated float data = 1 [packed = true];
  optional MatrixShape shape = 2;
}

// Row-major matrix.
// Requires: data.size() = product(shape.dims()).
message MatrixInt32 {
  repeated int32 data = 1 [packed = true];
  optional MatrixShape shape = 2;
}

message CameraName {
  enum Name {
    UNKNOWN = 0;
    FRONT = 1;
    FRONT_LEFT = 2;
    FRONT_RIGHT = 3;
    SIDE_LEFT = 4;
    SIDE_RIGHT = 5;
    REAR_LEFT = 6;
    REAR = 7;
    REAR_RIGHT = 8;
  }
}

// 'Laser' is used interchangeably with 'Lidar' in this file.
message LaserName {
  enum Name {
    UNKNOWN = 0;
    TOP = 1;
    FRONT = 2;
    SIDE_LEFT = 3;
    SIDE_RIGHT = 4;
    REAR = 5;
  }
}

// 4x4 row-major transform matrix that transforms 3d points from one frame to
// another.
message Transform {
  repeated double transform = 1;
}

message Velocity {
  // Velocity in m/s.
  optional float v_x = 1;
  optional float v_y = 2;
  optional float v_z = 3;

  // Angular velocity in rad/s.
  optional double w_x = 4;
  optional double w_y = 5;
  optional double w_z = 6;
}

message CameraCalibration {
  optional CameraName.Name name = 1;
  // 1d Array of [f_u, f_v, c_u, c_v, k{1, 2}, p{1, 2}, k{3}].
  // Note that this intrinsic corresponds to the images after scaling.
  // Camera model: pinhole camera.
  // Lens distortion:
  //   Radial distortion coefficients: k1, k2, k3.
  //   Tangential distortion coefficients: p1, p2.
  // k_{1, 2, 3} and p_{1, 2} follow the same definitions as OpenCV.
  // https://en.wikipedia.org/wiki/Distortion_(optics)
  // https://docs.opencv.org/2.4/doc/tutorials/calib3d/camera_calibration/camera_calibration.html
  repeated double intrinsic = 2;
  // Camera frame to vehicle frame.
  optional Transform extrinsic = 3;
  // Camera image size.
  optional int32 width = 4;
  optional int32 height = 5;

  enum RollingShutterReadOutDirection {
    UNKNOWN = 0;
    TOP_TO_BOTTOM = 1;
    LEFT_TO_RIGHT = 2;
    BOTTOM_TO_TOP = 3;
    RIGHT_TO_LEFT = 4;
    GLOBAL_SHUTTER = 5;
  }
  optional RollingShutterReadOutDirection rolling_shutter_direction = 6;
}
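// Example: a minimal Python sketch of how the 'intrinsic' array above can be
// unpacked into an OpenCV-style camera matrix and distortion vector. It
// assumes numpy is available and that 'calibration' is a parsed
// CameraCalibration message; the variable names are illustrative only.
//
//   import numpy as np
//
//   f_u, f_v, c_u, c_v, k1, k2, p1, p2, k3 = calibration.intrinsic
//   # 3x3 pinhole camera matrix.
//   camera_matrix = np.array([[f_u, 0.0, c_u],
//                             [0.0, f_v, c_v],
//                             [0.0, 0.0, 1.0]])
//   # OpenCV orders its distortion coefficients as (k1, k2, p1, p2, k3).
//   distortion = np.array([k1, k2, p1, p2, k3])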
message LaserCalibration {
  optional LaserName.Name name = 1;
  // If non-empty, the beam pitch (in radians) is non-uniform. When
  // constructing a range image, this mapping is used to map from beam pitch
  // to range image row. If this is empty, we assume a uniform distribution.
  repeated double beam_inclinations = 2;
  // beam_inclination_{min,max} (in radians) are used to determine the mapping.
  optional double beam_inclination_min = 3;
  optional double beam_inclination_max = 4;
  // Lidar frame to vehicle frame.
  optional Transform extrinsic = 5;
}

message Context {
  // A unique name that identifies the frame sequence.
  optional string name = 1;
  repeated CameraCalibration camera_calibrations = 2;
  repeated LaserCalibration laser_calibrations = 3;
  // Some stats for the run segment used.
  message Stats {
    message ObjectCount {
      optional Label.Type type = 1;
      // The number of unique objects with the type in the segment.
      optional int32 count = 2;
    }
    repeated ObjectCount laser_object_counts = 1;
    repeated ObjectCount camera_object_counts = 5;
    // Day, Dawn/Dusk, or Night, determined from sun elevation.
    optional string time_of_day = 2;
    // Human readable location (e.g. CHD, SF) of the run segment.
    optional string location = 3;
    // Currently either Sunny or Rain.
    optional string weather = 4;
  }
  optional Stats stats = 4;
}
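// Example: a minimal Python sketch of applying a Transform (such as
// LaserCalibration.extrinsic above) to 3d points. It assumes numpy is
// available, that 'calibration' is a parsed LaserCalibration message, and
// that 'points_sensor' is an [N, 3] array of points in the lidar frame; the
// variable names are illustrative only.
//
//   import numpy as np
//
//   # 'transform' holds 16 doubles forming a 4x4 row-major matrix.
//   lidar_to_vehicle = np.array(calibration.extrinsic.transform).reshape(4, 4)
//   # Append a homogeneous coordinate, transform, then drop it again.
//   ones = np.ones((points_sensor.shape[0], 1))
//   points_h = np.concatenate([points_sensor, ones], axis=1)
//   points_vehicle = (lidar_to_vehicle @ points_h.T).T[:, :3]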
// Range image is a 2d tensor. The first dim (row) represents pitch. The second
// dim represents yaw.
// There are two types of range images:
// 1. Raw range image: Raw range image with a non-empty
//    'range_image_pose_compressed' which tells the vehicle pose of each
//    range image cell.
// 2. Virtual range image: Range image with an empty
//    'range_image_pose_compressed'. This range image is constructed by
//    transforming all lidar points into a fixed vehicle frame (usually the
//    vehicle frame of the middle scan).
// NOTE: 'range_image_pose_compressed' is only populated for the first range
// image return. The second return has exactly the same range image pose as
// the first one.
message RangeImage {
  // Zlib compressed [H, W, 4] serialized version of MatrixFloat.
  // To decompress:
  //   string val = ZlibDecompress(range_image_compressed);
  //   MatrixFloat range_image;
  //   range_image.ParseFromString(val);
  // Inner dimensions are:
  //   * channel 0: range
  //   * channel 1: intensity
  //   * channel 2: elongation
  //   * channel 3: is in any no label zone.
  optional bytes range_image_compressed = 2;

  // Lidar point to camera image projections. A point can be projected to
  // multiple camera images. We pick the first two in the following order:
  // [FRONT, FRONT_LEFT, FRONT_RIGHT, SIDE_LEFT, SIDE_RIGHT].
  //
  // Zlib compressed [H, W, 6] serialized version of MatrixInt32.
  // To decompress:
  //   string val = ZlibDecompress(camera_projection_compressed);
  //   MatrixInt32 camera_projection;
  //   camera_projection.ParseFromString(val);
  // Inner dimensions are:
  //   * channel 0: CameraName.Name of 1st projection. Set to UNKNOWN if no
  //     projection.
  //   * channel 1: x (axis along image width)
  //   * channel 2: y (axis along image height)
  //   * channel 3: CameraName.Name of 2nd projection. Set to UNKNOWN if no
  //     projection.
  //   * channel 4: x (axis along image width)
  //   * channel 5: y (axis along image height)
  // Note: pixel 0 corresponds to the left edge of the first pixel in the
  // image.
  optional bytes camera_projection_compressed = 3;

  // Zlib compressed [H, W, 6] serialized version of MatrixFloat.
  // To decompress:
  //   string val = ZlibDecompress(range_image_pose_compressed);
  //   MatrixFloat range_image_pose;
  //   range_image_pose.ParseFromString(val);
  // Inner dimensions are [roll, pitch, yaw, x, y, z], representing a transform
  // from vehicle frame to global frame for every range image pixel.
  // This is ONLY populated for the first return. The second return is assumed
  // to have exactly the same range_image_pose_compressed.
  //
  // The roll, pitch and yaw are specified as 3-2-1 Euler angle rotations,
  // meaning that rotating from the navigation to vehicle frame consists of a
  // yaw, then pitch and finally roll rotation about the z, y and x axes
  // respectively. All rotations use the right hand rule and are positive
  // in the counter clockwise direction.
  optional bytes range_image_pose_compressed = 4;

  // Zlib compressed [H, W, 4] serialized version of MatrixFloat.
  // To decompress:
  //   string val = ZlibDecompress(range_image_flow_compressed);
  //   MatrixFloat range_image_flow;
  //   range_image_flow.ParseFromString(val);
  // Inner dimensions are [vx, vy, vz, pointwise class].
  //
  // If the point is not annotated with scene flow information, class is set
  // to -1. A point is not annotated if it is in a no-label zone or if its
  // label bounding box does not have a corresponding match in the previous
  // frame, making it infeasible to estimate the motion of the point.
  // Otherwise, (vx, vy, vz) are velocities along the (x, y, z) axes for this
  // point and class is set to one of the following values:
  //   -1: no-flow-label, the point has no flow information.
  //    0: unlabeled or "background", i.e., the point is not contained in a
  //       bounding box.
  //    1: vehicle, i.e., the point corresponds to a vehicle label box.
  //    2: pedestrian, i.e., the point corresponds to a pedestrian label box.
  //    3: sign, i.e., the point corresponds to a sign label box.
  //    4: cyclist, i.e., the point corresponds to a cyclist label box.
  optional bytes range_image_flow_compressed = 5;

  // Zlib compressed [H, W, 2] serialized version of MatrixInt32.
  // To decompress:
  //   string val = ZlibDecompress(segmentation_label_compressed);
  //   MatrixInt32 segmentation_label;
  //   segmentation_label.ParseFromString(val);
  // Inner dimensions are [instance_id, semantic_class].
  //
  // NOTE:
  // 1. Only the TOP LiDAR has segmentation labels.
  // 2. Not every frame has segmentation labels. This field is not set if a
  //    frame is not labeled.
  // 3. There can be points missing segmentation labels within a labeled
  //    frame. Their labels are set to TYPE_NOT_LABELED when that happens.
  optional bytes segmentation_label_compressed = 6;

  // Deprecated, do not use.
  optional MatrixFloat range_image = 1 [deprecated = true];
}
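// Example: a minimal Python sketch of the decompression described above,
// assuming numpy is available, that the generated protobuf module is
// importable as 'waymo_open_dataset.dataset_pb2', and that 'ri' is a parsed
// RangeImage message; the variable names are illustrative only.
//
//   import zlib
//   import numpy as np
//   from waymo_open_dataset import dataset_pb2
//
//   matrix = dataset_pb2.MatrixFloat()
//   matrix.ParseFromString(zlib.decompress(ri.range_image_compressed))
//   # Row-major layout, so a plain reshape recovers the [H, W, 4] tensor.
//   range_image = np.array(matrix.data, dtype=np.float32).reshape(
//       list(matrix.shape.dims))
//   ranges = range_image[..., 0]       # channel 0: range
//   intensities = range_image[..., 1]  # channel 1: intensity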
// Panoptic (instance + semantic) segmentation labels for a given camera image.
// Associations can also be provided between each instance ID and a globally
// unique ID across all frames.
message CameraSegmentationLabel {
  // The value used to separate instance_ids from different semantic classes.
  // See the panoptic_label field for how this is used. Must be set to be
  // greater than the maximum instance_id.
  optional int32 panoptic_label_divisor = 1;

  // A uint16 png encoded image, with the same resolution as the corresponding
  // camera image. Each pixel contains a panoptic segmentation label, which is
  // computed as:
  //   semantic_class_id * panoptic_label_divisor + instance_id.
  // We set instance_id = 0 for pixels for which there is no instance_id.
  // NOTE: Instance IDs in this label are only consistent within this camera
  // image. Use instance_id_to_global_id_mapping to get cross-camera consistent
  // instance IDs.
  optional bytes panoptic_label = 2;

  // A mapping between each panoptic label with an instance_id and a globally
  // unique ID across all frames within the same sequence. This can be used to
  // match instances across cameras and over time, i.e. instances belonging to
  // the same object will map to the same global ID across all frames in the
  // same sequence.
  // NOTE: These unique IDs are not consistent with other IDs in the dataset,
  // e.g. the bounding box IDs.
  message InstanceIDToGlobalIDMapping {
    optional int32 local_instance_id = 1;
    optional int32 global_instance_id = 2;
    // If false, the corresponding instance will not have consistent global
    // ids between frames.
    optional bool is_tracked = 3;
  }
  repeated InstanceIDToGlobalIDMapping instance_id_to_global_id_mapping = 3;

  // The sequence id for this label. The above instance_id_to_global_id_mapping
  // is only valid for labels that share the same sequence id.
  optional string sequence_id = 4;

  // A uint8 png encoded image, with the same resolution as the corresponding
  // camera image. The value of each pixel indicates the number of cameras that
  // overlap with this pixel. Used for the weighted Segmentation and Tracking
  // Quality (wSTQ) metric.
  optional bytes num_cameras_covered = 5;
}
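// Example: a minimal Python sketch of decoding the panoptic label defined
// above, assuming numpy and a PNG decoder such as Pillow are available and
// that 'label' is a parsed CameraSegmentationLabel; the variable names are
// illustrative only.
//
//   import io
//   import numpy as np
//   from PIL import Image
//
//   panoptic = np.array(Image.open(io.BytesIO(label.panoptic_label)))
//   # Invert panoptic = semantic_class_id * divisor + instance_id.
//   semantic_class = panoptic // label.panoptic_label_divisor
//   instance_id = panoptic % label.panoptic_label_divisor  # 0: no instance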
// All timestamps in this proto are represented as seconds since Unix epoch.
message CameraImage {
  optional CameraName.Name name = 1;
  // JPEG image.
  optional bytes image = 2;
  // SDC pose.
  optional Transform pose = 3;
  // SDC velocity at 'pose_timestamp' below. The velocity value is represented
  // in the *global* frame.
  // With this velocity, the pose can be extrapolated:
  //   r(t+dt) = r(t) + dr/dt * dt, where dr/dt = v_{x,y,z}.
  //   dR(t)/dt = W*R(t), where W = SkewSymmetric(w_{x,y,z}).
  // This differential equation solves to R(t) = exp(Wt)*R(0) if W is constant.
  // When dt is small: R(t+dt) = (I + W*dt)R(t).
  // r(t) = (x(t), y(t), z(t)) is the vehicle location at t in the global
  // frame.
  // R(t) = rotation matrix (3x3) from the body frame to the global frame at t.
  // SkewSymmetric(x, y, z) is the cross-product matrix defined in:
  // https://en.wikipedia.org/wiki/Cross_product#Conversion_to_matrix_multiplication
  optional Velocity velocity = 4;
  // Timestamp of the `pose` above.
  optional double pose_timestamp = 5;

  // Rolling shutter params.
  // The following explanation assumes a left->right rolling shutter.
  //
  // Rolling shutter cameras expose and read the image column by column, offset
  // by the read out time for each column. The desired timestamp for each
  // column is the middle of the exposure of that column, as outlined below for
  // an image with 3 columns:
  //   ------time------>
  //   |---- exposure col 1----| read |
  //   -------|---- exposure col 2----| read |
  //   --------------|---- exposure col 3----| read |
  //   ^trigger time                          ^readout end time
  //               ^time for column 1 (= middle of exposure of column 1)
  //                      ^time image center (= middle of exposure of middle column)

  // Shutter duration in seconds. Exposure time per column.
  optional double shutter = 6;
  // Time when the sensor was triggered and when the last readout finished.
  // The difference between trigger time and readout done time includes
  // the exposure time and the actual sensor readout time.
  optional double camera_trigger_time = 7;
  optional double camera_readout_done_time = 8;

  // Panoptic segmentation labels for this camera image.
  // NOTE: Not every image has panoptic segmentation labels.
  optional CameraSegmentationLabel camera_segmentation_label = 10;
}

// The camera labels associated with a given camera image. This message
// indicates the ground truth information for the camera image
// recorded by the given camera. If there are no labeled objects in the image,
// then the labels field is empty.
message CameraLabels {
  optional CameraName.Name name = 1;
  repeated Label labels = 2;
}

message Laser {
  optional LaserName.Name name = 1;
  optional RangeImage ri_return1 = 2;
  optional RangeImage ri_return2 = 3;
}

message Frame {
  // The following field numbers are reserved for third-party extensions. Users
  // may declare new fields in that range in their own .proto files without
  // having to edit the original file.
  extensions 1000 to max;

  // This context is the same for all frames belonging to the same driving run
  // segment. Use context.name to identify frames belonging to the same driving
  // segment. We do not store all frames from one driving segment in one proto
  // to avoid huge protos.
  optional Context context = 1;

  // Frame start time, which is the timestamp of the first top LiDAR scan
  // within this frame. Note that this timestamp does not correspond to the
  // provided vehicle pose (pose).
  optional int64 timestamp_micros = 2;

  // Frame vehicle pose. Note that unlike in CameraImage, the Frame pose does
  // not correspond to the provided timestamp (timestamp_micros). Instead, it
  // roughly (but not exactly) corresponds to the vehicle pose in the middle of
  // the given frame. The frame vehicle pose defines the coordinate system
  // which the 3D laser labels are defined in.
  optional Transform pose = 3;

  // The camera images.
  repeated CameraImage images = 4;

  // The LiDAR sensor data.
  repeated Laser lasers = 5;

  // Native 3D labels that correspond to the LiDAR sensor data. The 3D labels
  // are defined w.r.t. the frame vehicle pose coordinate system (pose).
  repeated Label laser_labels = 6;

  // The native 3D LiDAR labels (laser_labels) projected to camera images. A
  // projected label is the smallest image axis aligned rectangle that can
  // cover all projected points from the 3d LiDAR label. The projected label is
  // ignored if the projection is fully outside a camera image. The projected
  // label is clamped to the camera image if it is partially outside.
  repeated CameraLabels projected_lidar_labels = 9;

  // Native 2D camera labels. Note that if a camera identified by
  // CameraLabels.name has an entry in this field, then it has been labeled,
  // even though it is possible that there are no labeled objects in the
  // corresponding image, which is indicated by an empty CameraLabels.labels.
  repeated CameraLabels camera_labels = 8;

  // No label zones in the *global* frame.
  repeated Polygon2dProto no_label_zones = 7;

  // Map features. Only the first frame in a segment will contain map data.
  // This field will be empty for other frames as the map is identical for all
  // frames.
  repeated MapFeature map_features = 10;

  // Map pose offset. This offset must be added to lidar points from this frame
  // to compensate for pose drift and align with the map features.
  optional Vector3d map_pose_offset = 11;
}
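// Example: a minimal Python sketch of applying 'map_pose_offset' above to
// align lidar points with the map, assuming numpy is available, that Vector3d
// exposes x/y/z fields, and that 'points' is an [N, 3] array of lidar points
// from this frame expressed in the same coordinate frame as the map features;
// the variable names are illustrative only.
//
//   import numpy as np
//
//   offset = np.array([frame.map_pose_offset.x,
//                      frame.map_pose_offset.y,
//                      frame.map_pose_offset.z])
//   points_aligned = points + offset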