syntax = "proto2";

package waymo.open_dataset;

import "waymo_open_dataset/label.proto";
import "waymo_open_dataset/protos/map.proto";
import "waymo_open_dataset/protos/vector.proto";

message MatrixShape {
  // Dimensions for the Matrix messages defined below. Must not be empty.
  //
  // The order of entries in 'dims' matters, as it indicates the layout of the
  // values in the tensor in-memory representation.
  //
  // The first entry in 'dims' is the outermost dimension used to lay out the
  // values; the last entry is the innermost dimension. This matches the
  // in-memory layout of row-major matrices.
  repeated int32 dims = 1;
}

// Row-major matrix.
// Requires: data.size() = product(shape.dims()).
message MatrixFloat {
  repeated float data = 1 [packed = true];
  optional MatrixShape shape = 2;
}

// Row-major matrix.
// Requires: data.size() = product(shape.dims()).
message MatrixInt32 {
  repeated int32 data = 1 [packed = true];
  optional MatrixShape shape = 2;
}

message CameraName {
  enum Name {
    UNKNOWN = 0;
    FRONT = 1;
    FRONT_LEFT = 2;
    FRONT_RIGHT = 3;
    SIDE_LEFT = 4;
    SIDE_RIGHT = 5;
    REAR_LEFT = 6;
    REAR = 7;
    REAR_RIGHT = 8;
  }
}

// 'Laser' is used interchangeably with 'Lidar' in this file.
message LaserName {
  enum Name {
    UNKNOWN = 0;
    TOP = 1;
    FRONT = 2;
    SIDE_LEFT = 3;
    SIDE_RIGHT = 4;
    REAR = 5;
  }
}

// 4x4 row-major transform matrix that transforms 3d points from one frame to
// another.
message Transform {
  repeated double transform = 1;
}

message Velocity {
  // Velocity in m/s.
  optional float v_x = 1;
  optional float v_y = 2;
  optional float v_z = 3;

  // Angular velocity in rad/s.
  optional double w_x = 4;
  optional double w_y = 5;
  optional double w_z = 6;
}

message CameraCalibration {
  optional CameraName.Name name = 1;
  // 1d Array of [f_u, f_v, c_u, c_v, k{1, 2}, p{1, 2}, k{3}].
  // Note that this intrinsic corresponds to the images after scaling.
  // Camera model: pinhole camera.
  // Lens distortion:
  //   Radial distortion coefficients: k1, k2, k3.
  //   Tangential distortion coefficients: p1, p2.
  // k_{1, 2, 3} and p_{1, 2} follow the same definitions as OpenCV.
  // https://en.wikipedia.org/wiki/Distortion_(optics)
  // https://docs.opencv.org/2.4/doc/tutorials/calib3d/camera_calibration/camera_calibration.html
  repeated double intrinsic = 2;
  // Camera frame to vehicle frame.
  optional Transform extrinsic = 3;
  // Camera image size.
  optional int32 width = 4;
  optional int32 height = 5;

  enum RollingShutterReadOutDirection {
    UNKNOWN = 0;
    TOP_TO_BOTTOM = 1;
    LEFT_TO_RIGHT = 2;
    BOTTOM_TO_TOP = 3;
    RIGHT_TO_LEFT = 4;
    GLOBAL_SHUTTER = 5;
  }
  optional RollingShutterReadOutDirection rolling_shutter_direction = 6;
}
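// Example: a minimal Python sketch of how the 'intrinsic' array above can be
// unpacked into an OpenCV-style camera matrix and distortion vector. It
// assumes numpy is available and that 'calibration' is a parsed
// CameraCalibration message; the variable names are illustrative only.
//
//   import numpy as np
//
//   f_u, f_v, c_u, c_v, k1, k2, p1, p2, k3 = calibration.intrinsic
//   # 3x3 pinhole camera matrix.
//   camera_matrix = np.array([[f_u, 0.0, c_u],
//                             [0.0, f_v, c_v],
//                             [0.0, 0.0, 1.0]])
//   # OpenCV orders its distortion coefficients as (k1, k2, p1, p2, k3).
//   distortion = np.array([k1, k2, p1, p2, k3])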
message LaserCalibration {
  optional LaserName.Name name = 1;
  // If non-empty, the beam pitch (in radians) is non-uniform. When
  // constructing a range image, this mapping is used to map from beam pitch
  // to range image row. If this is empty, we assume a uniform distribution.
  repeated double beam_inclinations = 2;
  // beam_inclination_{min,max} (in radians) are used to determine the mapping.
  optional double beam_inclination_min = 3;
  optional double beam_inclination_max = 4;
  // Lidar frame to vehicle frame.
  optional Transform extrinsic = 5;
}

message Context {
  // A unique name that identifies the frame sequence.
  optional string name = 1;
  repeated CameraCalibration camera_calibrations = 2;
  repeated LaserCalibration laser_calibrations = 3;
  // Some stats for the run segment used.
  message Stats {
    message ObjectCount {
      optional Label.Type type = 1;
      // The number of unique objects with the type in the segment.
      optional int32 count = 2;
    }
    repeated ObjectCount laser_object_counts = 1;
    repeated ObjectCount camera_object_counts = 5;
    // Day, Dawn/Dusk, or Night, determined from sun elevation.
    optional string time_of_day = 2;
    // Human readable location (e.g. CHD, SF) of the run segment.
    optional string location = 3;
    // Currently either Sunny or Rain.
    optional string weather = 4;
  }
  optional Stats stats = 4;
}
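// Example: a minimal Python sketch of applying a Transform (such as
// LaserCalibration.extrinsic above) to 3d points. It assumes numpy is
// available, that 'calibration' is a parsed LaserCalibration message, and
// that 'points_sensor' is an [N, 3] array of points in the lidar frame; the
// variable names are illustrative only.
//
//   import numpy as np
//
//   # 'transform' holds 16 doubles forming a 4x4 row-major matrix.
//   lidar_to_vehicle = np.array(calibration.extrinsic.transform).reshape(4, 4)
//   # Append a homogeneous coordinate, transform, then drop it again.
//   ones = np.ones((points_sensor.shape[0], 1))
//   points_h = np.concatenate([points_sensor, ones], axis=1)
//   points_vehicle = (lidar_to_vehicle @ points_h.T).T[:, :3]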
// Range image is a 2d tensor. The first dim (row) represents pitch. The second
// dim represents yaw.
// There are two types of range images:
// 1. Raw range image: Raw range image with a non-empty
//    'range_image_pose_compressed' which tells the vehicle pose of each
//    range image cell.
// 2. Virtual range image: Range image with an empty
//    'range_image_pose_compressed'. This range image is constructed by
//    transforming all lidar points into a fixed vehicle frame (usually the
//    vehicle frame of the middle scan).
// NOTE: 'range_image_pose_compressed' is only populated for the first range
// image return. The second return has exactly the same range image pose as
// the first one.
message RangeImage {
  // Zlib compressed [H, W, 4] serialized version of MatrixFloat.
  // To decompress:
  //   string val = ZlibDecompress(range_image_compressed);
  //   MatrixFloat range_image;
  //   range_image.ParseFromString(val);
  // Inner dimensions are:
  //   * channel 0: range
  //   * channel 1: intensity
  //   * channel 2: elongation
  //   * channel 3: is in any no label zone.
  optional bytes range_image_compressed = 2;

  // Lidar point to camera image projections. A point can be projected to
  // multiple camera images. We pick the first two in the following order:
  // [FRONT, FRONT_LEFT, FRONT_RIGHT, SIDE_LEFT, SIDE_RIGHT].
  //
  // Zlib compressed [H, W, 6] serialized version of MatrixInt32.
  // To decompress:
  //   string val = ZlibDecompress(camera_projection_compressed);
  //   MatrixInt32 camera_projection;
  //   camera_projection.ParseFromString(val);
  // Inner dimensions are:
  //   * channel 0: CameraName.Name of 1st projection. Set to UNKNOWN if no
  //     projection.
  //   * channel 1: x (axis along image width)
  //   * channel 2: y (axis along image height)
  //   * channel 3: CameraName.Name of 2nd projection. Set to UNKNOWN if no
  //     projection.
  //   * channel 4: x (axis along image width)
  //   * channel 5: y (axis along image height)
  // Note: pixel 0 corresponds to the left edge of the first pixel in the
  // image.
  optional bytes camera_projection_compressed = 3;

  // Zlib compressed [H, W, 6] serialized version of MatrixFloat.
  // To decompress:
  //   string val = ZlibDecompress(range_image_pose_compressed);
  //   MatrixFloat range_image_pose;
  //   range_image_pose.ParseFromString(val);
  // Inner dimensions are [roll, pitch, yaw, x, y, z], representing a transform
  // from vehicle frame to global frame for every range image pixel.
  // This is ONLY populated for the first return. The second return is assumed
  // to have exactly the same range_image_pose_compressed.
  //
  // The roll, pitch and yaw are specified as 3-2-1 Euler angle rotations,
  // meaning that rotating from the navigation to vehicle frame consists of a
  // yaw, then pitch and finally roll rotation about the z, y and x axes
  // respectively. All rotations use the right hand rule and are positive
  // in the counter clockwise direction.
  optional bytes range_image_pose_compressed = 4;

  // Zlib compressed [H, W, 4] serialized version of MatrixFloat.
  // To decompress:
  //   string val = ZlibDecompress(range_image_flow_compressed);
  //   MatrixFloat range_image_flow;
  //   range_image_flow.ParseFromString(val);
  // Inner dimensions are [vx, vy, vz, pointwise class].
  //
  // If the point is not annotated with scene flow information, class is set
  // to -1. A point is not annotated if it is in a no-label zone or if its
  // label bounding box does not have a corresponding match in the previous
  // frame, making it infeasible to estimate the motion of the point.
  // Otherwise, (vx, vy, vz) are velocities along the (x, y, z) axes for this
  // point and class is set to one of the following values:
  //   -1: no-flow-label, the point has no flow information.
  //    0: unlabeled or "background", i.e., the point is not contained in a
  //       bounding box.
  //    1: vehicle, i.e., the point corresponds to a vehicle label box.
  //    2: pedestrian, i.e., the point corresponds to a pedestrian label box.
  //    3: sign, i.e., the point corresponds to a sign label box.
  //    4: cyclist, i.e., the point corresponds to a cyclist label box.
  optional bytes range_image_flow_compressed = 5;

  // Zlib compressed [H, W, 2] serialized version of MatrixInt32.
  // To decompress:
  //   string val = ZlibDecompress(segmentation_label_compressed);
  //   MatrixInt32 segmentation_label;
  //   segmentation_label.ParseFromString(val);
  // Inner dimensions are [instance_id, semantic_class].
  //
  // NOTE:
  // 1. Only the TOP LiDAR has segmentation labels.
  // 2. Not every frame has segmentation labels. This field is not set if a
  //    frame is not labeled.
  // 3. There can be points missing segmentation labels within a labeled
  //    frame. Their labels are set to TYPE_NOT_LABELED when that happens.
  optional bytes segmentation_label_compressed = 6;

  // Deprecated, do not use.
  optional MatrixFloat range_image = 1 [deprecated = true];
}
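// Example: a minimal Python sketch of the decompression described above,
// assuming numpy is available, that the generated protobuf module is
// importable as 'waymo_open_dataset.dataset_pb2', and that 'ri' is a parsed
// RangeImage message; the variable names are illustrative only.
//
//   import zlib
//   import numpy as np
//   from waymo_open_dataset import dataset_pb2
//
//   matrix = dataset_pb2.MatrixFloat()
//   matrix.ParseFromString(zlib.decompress(ri.range_image_compressed))
//   # Row-major layout, so a plain reshape recovers the [H, W, 4] tensor.
//   range_image = np.array(matrix.data, dtype=np.float32).reshape(
//       list(matrix.shape.dims))
//   ranges = range_image[..., 0]       # channel 0: range
//   intensities = range_image[..., 1]  # channel 1: intensity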
// Panoptic (instance + semantic) segmentation labels for a given camera image.
// Associations can also be provided between each instance ID and a globally
// unique ID across all frames.
message CameraSegmentationLabel {
  // The value used to separate instance_ids from different semantic classes.
  // See the panoptic_label field for how this is used. Must be set to be
  // greater than the maximum instance_id.
  optional int32 panoptic_label_divisor = 1;

  // A uint16 png encoded image, with the same resolution as the corresponding
  // camera image. Each pixel contains a panoptic segmentation label, which is
  // computed as:
  //   semantic_class_id * panoptic_label_divisor + instance_id.
  // We set instance_id = 0 for pixels for which there is no instance_id.
  // NOTE: Instance IDs in this label are only consistent within this camera
  // image. Use instance_id_to_global_id_mapping to get cross-camera consistent
  // instance IDs.
  optional bytes panoptic_label = 2;

  // A mapping between each panoptic label with an instance_id and a globally
  // unique ID across all frames within the same sequence. This can be used to
  // match instances across cameras and over time, i.e. instances belonging to
  // the same object will map to the same global ID across all frames in the
  // same sequence.
  // NOTE: These unique IDs are not consistent with other IDs in the dataset,
  // e.g. the bounding box IDs.
  message InstanceIDToGlobalIDMapping {
    optional int32 local_instance_id = 1;
    optional int32 global_instance_id = 2;
    // If false, the corresponding instance will not have consistent global
    // ids between frames.
    optional bool is_tracked = 3;
  }
  repeated InstanceIDToGlobalIDMapping instance_id_to_global_id_mapping = 3;

  // The sequence id for this label. The above instance_id_to_global_id_mapping
  // is only valid for labels that share the same sequence id.
  optional string sequence_id = 4;

  // A uint8 png encoded image, with the same resolution as the corresponding
  // camera image. The value of each pixel indicates the number of cameras that
  // overlap with this pixel. Used for the weighted Segmentation and Tracking
  // Quality (wSTQ) metric.
  optional bytes num_cameras_covered = 5;
}
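// Example: a minimal Python sketch of decoding the panoptic label defined
// above, assuming numpy and a PNG decoder such as Pillow are available and
// that 'label' is a parsed CameraSegmentationLabel; the variable names are
// illustrative only.
//
//   import io
//   import numpy as np
//   from PIL import Image
//
//   panoptic = np.array(Image.open(io.BytesIO(label.panoptic_label)))
//   # Invert panoptic = semantic_class_id * divisor + instance_id.
//   semantic_class = panoptic // label.panoptic_label_divisor
//   instance_id = panoptic % label.panoptic_label_divisor  # 0: no instance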
// All timestamps in this proto are represented as seconds since Unix epoch.
message CameraImage {
  optional CameraName.Name name = 1;
  // JPEG image.
  optional bytes image = 2;
  // SDC pose.
  optional Transform pose = 3;
  // SDC velocity at 'pose_timestamp' below. The velocity value is represented
  // in the *global* frame.
  // With this velocity, the pose can be extrapolated:
  //   r(t+dt) = r(t) + dr/dt * dt, where dr/dt = v_{x,y,z}.
  //   dR(t)/dt = W*R(t), where W = SkewSymmetric(w_{x,y,z}).
  // This differential equation solves to R(t) = exp(Wt)*R(0) if W is constant.
  // When dt is small: R(t+dt) = (I + W*dt)R(t).
  // r(t) = (x(t), y(t), z(t)) is the vehicle location at t in the global
  // frame.
  // R(t) = rotation matrix (3x3) from the body frame to the global frame at t.
  // SkewSymmetric(x, y, z) is the cross-product matrix defined in:
  // https://en.wikipedia.org/wiki/Cross_product#Conversion_to_matrix_multiplication
  optional Velocity velocity = 4;
  // Timestamp of the `pose` above.
  optional double pose_timestamp = 5;

  // Rolling shutter params.
  // The following explanation assumes a left->right rolling shutter.
  //
  // Rolling shutter cameras expose and read the image column by column, offset
  // by the read out time for each column. The desired timestamp for each
  // column is the middle of the exposure of that column, as outlined below for
  // an image with 3 columns:
  //   ------time------>
  //   |---- exposure col 1----| read |
  //   -------|---- exposure col 2----| read |
  //   --------------|---- exposure col 3----| read |
  //   ^trigger time                          ^readout end time
  //               ^time for column 1 (= middle of exposure of column 1)
  //                      ^time image center (= middle of exposure of middle column)

  // Shutter duration in seconds. Exposure time per column.
  optional double shutter = 6;
  // Time when the sensor was triggered and when the last readout finished.
  // The difference between trigger time and readout done time includes
  // the exposure time and the actual sensor readout time.
  optional double camera_trigger_time = 7;
  optional double camera_readout_done_time = 8;

  // Panoptic segmentation labels for this camera image.
  // NOTE: Not every image has panoptic segmentation labels.
  optional CameraSegmentationLabel camera_segmentation_label = 10;
}

// The camera labels associated with a given camera image. This message
// indicates the ground truth information for the camera image
// recorded by the given camera. If there are no labeled objects in the image,
// then the labels field is empty.
message CameraLabels {
  optional CameraName.Name name = 1;
  repeated Label labels = 2;
}

message Laser {
  optional LaserName.Name name = 1;
  optional RangeImage ri_return1 = 2;
  optional RangeImage ri_return2 = 3;
}

message Frame {
  // The following field numbers are reserved for third-party extensions. Users
  // may declare new fields in that range in their own .proto files without
  // having to edit the original file.
  extensions 1000 to max;

  // This context is the same for all frames belonging to the same driving run
  // segment. Use context.name to identify frames belonging to the same driving
  // segment. We do not store all frames from one driving segment in one proto
  // to avoid huge protos.
  optional Context context = 1;

  // Frame start time, which is the timestamp of the first top LiDAR scan
  // within this frame. Note that this timestamp does not correspond to the
  // provided vehicle pose (pose).
  optional int64 timestamp_micros = 2;

  // Frame vehicle pose. Note that unlike in CameraImage, the Frame pose does
  // not correspond to the provided timestamp (timestamp_micros). Instead, it
  // roughly (but not exactly) corresponds to the vehicle pose in the middle of
  // the given frame. The frame vehicle pose defines the coordinate system
  // which the 3D laser labels are defined in.
  optional Transform pose = 3;

  // The camera images.
  repeated CameraImage images = 4;

  // The LiDAR sensor data.
  repeated Laser lasers = 5;

  // Native 3D labels that correspond to the LiDAR sensor data. The 3D labels
  // are defined w.r.t. the frame vehicle pose coordinate system (pose).
  repeated Label laser_labels = 6;

  // The native 3D LiDAR labels (laser_labels) projected to camera images. A
  // projected label is the smallest image axis aligned rectangle that can
  // cover all projected points from the 3d LiDAR label. The projected label is
  // ignored if the projection is fully outside a camera image. The projected
  // label is clamped to the camera image if it is partially outside.
  repeated CameraLabels projected_lidar_labels = 9;

  // Native 2D camera labels. Note that if a camera identified by
  // CameraLabels.name has an entry in this field, then it has been labeled,
  // even though it is possible that there are no labeled objects in the
  // corresponding image, which is indicated by an empty CameraLabels.labels.
  repeated CameraLabels camera_labels = 8;

  // No label zones in the *global* frame.
  repeated Polygon2dProto no_label_zones = 7;

  // Map features. Only the first frame in a segment will contain map data.
  // This field will be empty for other frames as the map is identical for all
  // frames.
  repeated MapFeature map_features = 10;

  // Map pose offset. This offset must be added to lidar points from this frame
  // to compensate for pose drift and align with the map features.
  optional Vector3d map_pose_offset = 11;
}
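// Example: a minimal Python sketch of applying 'map_pose_offset' above to
// align lidar points with the map, assuming numpy is available, that Vector3d
// exposes x/y/z fields, and that 'points' is an [N, 3] array of lidar points
// from this frame expressed in the same coordinate frame as the map features;
// the variable names are illustrative only.
//
//   import numpy as np
//
//   offset = np.array([frame.map_pose_offset.x,
//                      frame.map_pose_offset.y,
//                      frame.map_pose_offset.z])
//   points_aligned = points + offset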