Swift Image preprocessing: normalization with mean [0.485, 0.456, 0.406], std [0.229, 0.224, 0.225]


I know Core ML has image preprocessing options for this. But I want to know how to do the image normalization myself in Swift, so I can check that the result I produce is the same as what a Core ML model with image preprocessing produces.

For the Core ML image preprocessing options, I set the bias and scale as suggested for PyTorch models: scale = 1/(0.226*255.0), bias = [-0.485/0.229, -0.456/0.224, -0.406/0.225].
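As I understand it, the reasoning behind those numbers (my own sketch, not something from the converter) is that dividing by 255 and standardizing per channel folds into Core ML's y = scale * x + bias form, except that Core ML only takes a single scalar scale, so the three per-channel stds are approximated by one value (0.226):

// (x/255 - mean) / std  ==  x / (std * 255) - mean / std
// Core ML applies y = scale * x + bias with one global scale, so 0.226
// (roughly the average of 0.229, 0.224, 0.225) stands in for all three stds.
let scale = 1.0 / (0.226 * 255.0)                               // ≈ 0.01735
let bias = [-0.485 / 0.229, -0.456 / 0.224, -0.406 / 0.225]     // ≈ [-2.118, -2.036, -1.804]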

With a Core ML model that does not use those options, I tried to normalize the image myself using the methods below, but with no luck.

  1. Convert the UIImage to a CVPixelBuffer -> standardize the pixel buffer -> model.predict (a small read-back check follows the function below):
let mean: [Float] = [0.406, 0.456, 0.485] // in "BGR"
let std: [Float] = [0.225, 0.224, 0.229]

func normalizePixelBuffer(_ pixelBuffer: CVPixelBuffer) -> CVPixelBuffer? {
    let width = CVPixelBufferGetWidth(pixelBuffer)
    let height = CVPixelBufferGetHeight(pixelBuffer)
    
    // Create a new pixel buffer for normalized data
    var normalizedPixelBuffer: CVPixelBuffer?
    CVPixelBufferCreate(nil, width, height, kCVPixelFormatType_32BGRA, nil, &normalizedPixelBuffer)
    
    guard let normalizedBuffer = normalizedPixelBuffer else {
        return nil
    }
    
    CVPixelBufferLockBaseAddress(pixelBuffer, .readOnly)
    CVPixelBufferLockBaseAddress(normalizedBuffer, [])
    
    // Get pointers to the pixel buffers
    let baseAddress = CVPixelBufferGetBaseAddress(pixelBuffer)
    let normalizedBaseAddress = CVPixelBufferGetBaseAddress(normalizedBuffer)
    
    // Normalize the pixel values
    let bytesPerRow = CVPixelBufferGetBytesPerRow(pixelBuffer)
    let normalizedBytesPerRow = CVPixelBufferGetBytesPerRow(normalizedBuffer)
    
    for y in 0..<height {
        for x in 0..<width {
            let pixelOffset = y * bytesPerRow + x * 4
            let normalizedPixelOffset = y * normalizedBytesPerRow + x * 4
            
            // Access the pixel values
            let pixel = baseAddress!.advanced(by: pixelOffset).assumingMemoryBound(to: UInt8.self)
            let normalizedPixel = normalizedBaseAddress!.advanced(by: normalizedPixelOffset).assumingMemoryBound(to: Float.self)
            
            // Perform normalization (assuming 8-bit pixel values)
            normalizedPixel[0] = ((Float(pixel[0])/255 - mean[0]) / std[0])
            normalizedPixel[1] = ((Float(pixel[1])/255 - mean[1]) / std[1])
            normalizedPixel[2] = ((Float(pixel[2])/255 - mean[2]) / std[2])
            normalizedPixel[3] = Float(pixel[3]) // Assuming alpha channel is not normalized
        }
    }
    
    CVPixelBufferUnlockBaseAddress(pixelBuffer, .readOnly)
    CVPixelBufferUnlockBaseAddress(normalizedBuffer, [])
    
    return normalizedBuffer
}
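To see what the function above actually writes, this is the kind of check I use (just a sketch; firstNormalizedPixel is a helper name I made up): read the first pixel back as Floats, the same way normalizePixelBuffer writes them, and compare against values computed by hand.

func firstNormalizedPixel(_ buffer: CVPixelBuffer) -> [Float] {
    CVPixelBufferLockBaseAddress(buffer, .readOnly)
    defer { CVPixelBufferUnlockBaseAddress(buffer, .readOnly) }
    guard let base = CVPixelBufferGetBaseAddress(buffer) else { return [] }
    // Read the values exactly as normalizePixelBuffer wrote them: Floats in B, G, R order.
    let p = base.assumingMemoryBound(to: Float.self)
    return [p[0], p[1], p[2]]
}

// For a source pixel with B = G = R = 128 the expected values are:
// B: (128/255 - 0.406) / 0.225 ≈ 0.427
// G: (128/255 - 0.456) / 0.224 ≈ 0.205
// R: (128/255 - 0.485) / 0.229 ≈ 0.074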
  1. "UIImage normalization" -> converting to CVPixelBuffer -> model.predict
extension UIImage {
  func normalize() -> UIImage? {
      let colorSpace = CGColorSpaceCreateDeviceRGB()

      guard let cgImage = cgImage else {
          return nil
      }

      let width = cgImage.width
      let height = cgImage.height

      var rawData = [UInt8](repeating: 0, count: width * height * 4)
      let bytesPerPixel = 4
      let bytesPerRow = bytesPerPixel * width
      let bitsPerComponent = 8

      let bitmapInfo = CGImageAlphaInfo.premultipliedLast.rawValue

      guard let context = CGContext(data: &rawData,
                                    width: width,
                                    height: height,
                                    bitsPerComponent: bitsPerComponent,
                                    bytesPerRow: bytesPerRow,
                                    space: colorSpace,
                                    bitmapInfo: bitmapInfo) else { return nil }

      let drawingRect = CGRect(origin: .zero, size: CGSize(width: width, height: height))
      context.draw(cgImage, in: drawingRect)

      var maxValue: UInt8 = 0
      var minValue: UInt8 = 255

      for pixel in 0 ..< width * height {
          let baseOffset = pixel * 4
          for offset in baseOffset ..< baseOffset + 3 {
              let value = rawData[offset]
              if value > maxValue { maxValue = value }
              if value < minValue { minValue = value }
          }
      }
      let range = Float(maxValue - minValue)
      guard range > 0 else { return nil }

      for pixel in 0 ..< width * height {
          let baseOffset = pixel * 4
          // Clamp the standardized value to [0, 1] and rescale to 0–255 so it fits the 8-bit channel
          rawData[baseOffset]     = UInt8(max(0.0, min(1.0, (Float(rawData[baseOffset]) / 255 - 0.485) / 0.229)) * 255)
          rawData[baseOffset + 1] = UInt8(max(0.0, min(1.0, (Float(rawData[baseOffset + 1]) / 255 - 0.456) / 0.224)) * 255)
          rawData[baseOffset + 2] = UInt8(max(0.0, min(1.0, (Float(rawData[baseOffset + 2]) / 255 - 0.406) / 0.225)) * 255)
      }

      return context.makeImage().map { UIImage(cgImage: $0, scale: scale, orientation: imageOrientation) }
  }
}
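For comparison, this is the difference I would expect between the converted model's scale/bias and the exact per-channel formula, worked out for a mid-gray pixel (again just a sketch):

// What the Core ML scale/bias from above yields for x = 128, versus the exact
// (x/255 - mean)/std, per channel in R, G, B order:
let x: Float = 128
let scale: Float = 1 / (0.226 * 255)
let bias: [Float] = [-0.485 / 0.229, -0.456 / 0.224, -0.406 / 0.225]
let viaCoreML = bias.map { x * scale + $0 }        // ≈ [0.103, 0.185, 0.417]
let exact: [Float] = [(x / 255 - 0.485) / 0.229,
                      (x / 255 - 0.456) / 0.224,
                      (x / 255 - 0.406) / 0.225]   // ≈ [0.074, 0.205, 0.427]

The small differences come from approximating the three stds with the single value 0.226, so I would not expect the two pipelines to match exactly.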

The first approach gives a wrong result, and the second has no effect, as if no normalization were applied. I think the Core ML model's predict in Swift expects a CVPixelBuffer with each channel in [0, 255]? Is there anything wrong, or is there something I forgot?
