SwiftUI – How can I recognize words and get positions in Vision

March 17, 2024

The code below recognizes the text in an image, but I don’t know how to get the position of each word.
I can get the recognized strings, but not their bounding boxes.

From: https://medium.com/@jakir/text-recognition-or-ocr-using-vision-framework-ios-swiftui-b9c5df36ec32

import SwiftUI
import Vision

/// Demo screen that runs Vision text recognition on the bundled "quote" image
/// and shows the recognized lines in an editable text area.
struct ContentView: View {
    
    /// Recognized lines joined with newlines; bound to the `TextEditor` in `body`.
    @State var recognizedText = ""
    
    var body: some View {
        VStack {
            Text("OCR using Vission")
                .font(.title)
            
            Image("quote")
                .resizable()
                .scaledToFit()
            
            Button("Recognize Text") {
                ocr()
            }
            
            TextEditor(text: $recognizedText)
        }
        .padding()
    }
    
    /// Recognizes the text in the "quote" image and stores it in `recognizedText`.
    ///
    /// The Vision request is performed on a background queue because
    /// `VNImageRequestHandler.perform(_:)` is synchronous and `.accurate`
    /// recognition is slow enough to freeze the UI if run from the button action.
    /// On failure the error is printed and `recognizedText` is left unchanged.
    func ocr() {
        guard let cgImage = UIImage(named: "quote")?.cgImage else {
            print("OCR skipped: image \"quote\" is missing or has no CGImage")
            return
        }
        
        DispatchQueue.global(qos: .userInitiated).async {
            let recognizeRequest = VNRecognizeTextRequest()
            recognizeRequest.recognitionLevel = .accurate
            
            let handler = VNImageRequestHandler(cgImage: cgImage)
            do {
                try handler.perform([recognizeRequest])
            } catch {
                print(error)
                return
            }
            
            // `perform` has returned, so the results (if any) are final.
            guard let observations = recognizeRequest.results else {
                return
            }
            
            // Keep only the best candidate string of each observation.
            let lines = observations.compactMap { observation in
                observation.topCandidates(1).first?.string
            }
            
            // State that drives the UI must be updated on the main queue.
            DispatchQueue.main.async {
                recognizedText = lines.joined(separator: "\n")
            }
        }
    }
}

I searched many websites but could not find a solution.

>Solution :

This answer shows how to get the bounding boxes of recognized sentences (one box per text observation), but not of individual words.
Extracting Word-Level BoundingBoxes with VNRecognizeTextRequest's .accurate in Vision Framework – SwiftUI

/// Recognizes text in `image` and reports each recognized string with its bounding box.
///
/// One entry is produced per `VNRecognizedTextObservation`, so each box covers a
/// whole observation rather than a single word. The rectangles are the
/// observations' `boundingBox` values as reported by Vision (normalized image
/// coordinates — see the Vision documentation for the coordinate system).
///
/// Recognition runs on a background queue because `perform(_:)` is synchronous.
///
/// - Parameters:
///   - image: The image to analyze. It must be backed by a `CGImage`.
///   - completion: Always called exactly once, on the main queue. Receives the
///     recognized strings and their bounding boxes in matching order; both arrays
///     are empty when the image cannot be read, recognition fails, or no text is found.
func recognizeText(image: UIImage, completion: @escaping([String], [CGRect]) -> Void) {
    // Single delivery point so every path reports back on the main queue.
    func finish(_ texts: [String], _ positions: [CGRect]) {
        DispatchQueue.main.async {
            print(texts)
            print(positions)
            completion(texts, positions)
        }
    }

    guard let cgImage = image.cgImage else {
        print("Text recognition error: image has no CGImage")
        finish([], [])
        return
    }

    DispatchQueue.global(qos: .userInitiated).async {
        let request = VNRecognizeTextRequest()
        request.recognitionLevel = .accurate

        let handler = VNImageRequestHandler(cgImage: cgImage)
        do {
            try handler.perform([request])
        } catch {
            print("Text recognition error: \(error.localizedDescription)")
            finish([], [])
            return
        }

        var texts: [String] = []
        var positions: [CGRect] = []
        // Skip observations without a candidate so both arrays stay index-aligned.
        for observation in request.results ?? [] {
            guard let topCandidate = observation.topCandidates(1).first else { continue }
            texts.append(topCandidate.string)
            positions.append(observation.boundingBox)
        }
        finish(texts, positions)
    }
}