Anyone know how to use Apple's Vision framework for real-time text recognition?
I can't seem to find a way to do this without using the document scanner, instead supplementing it with AVFoundation. I'm trying to create a feature where the user can tap a button, scan text, and then save that to some text view, without having the user tap a camera button, keep scanning, save, etc.

I've gotten it to work for object detection, but I can't get it to work for text recognition. So, is there a way to use Apple's Vision framework for real-time text recognition? Any help would be greatly appreciated.
For performance reasons, I'd prefer not to convert the CMSampleBuffer to a UIImage, and would instead use the following to create an AVCaptureVideoPreviewLayer for the live video:
import AVFoundation
import UIKit

class CameraFeedView: UIView {
    private var previewLayer: AVCaptureVideoPreviewLayer!
    
    // Make the view's backing layer an AVCaptureVideoPreviewLayer, so no
    // separate sublayer has to be added and kept resized.
    override class var layerClass: AnyClass {
        return AVCaptureVideoPreviewLayer.self
    }
    
    init(frame: CGRect, session: AVCaptureSession, videoOrientation: AVCaptureVideoOrientation) {
        super.init(frame: frame)
        previewLayer = layer as? AVCaptureVideoPreviewLayer
        previewLayer.session = session
        previewLayer.videoGravity = .resizeAspect
        previewLayer.connection?.videoOrientation = videoOrientation
    }
    
    required init?(coder: NSCoder) {
        fatalError("init(coder:) has not been implemented")
    }
}
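If your interface can rotate after setup, the preview connection's orientation also needs to be kept in sync. A minimal sketch of that, where updateOrientation is an assumed helper name and the mapping mirrors the interface-orientation switch used later in setupAVSession:

extension CameraFeedView {
    // Assumed helper: call from the owning view controller when the interface
    // orientation changes. (Must live in the same file as CameraFeedView to
    // access the private previewLayer.)
    func updateOrientation(_ interfaceOrientation: UIInterfaceOrientation) {
        switch interfaceOrientation {
        case .landscapeLeft:
            previewLayer.connection?.videoOrientation = .landscapeLeft
        case .landscapeRight:
            previewLayer.connection?.videoOrientation = .landscapeRight
        case .portraitUpsideDown:
            previewLayer.connection?.videoOrientation = .portraitUpsideDown
        default:
            previewLayer.connection?.videoOrientation = .portrait
        }
    }
}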
Once you have that, you can process the live video data using Vision:
import AVFoundation
import UIKit
import Vision

class CameraViewController: UIViewController, AVCaptureVideoDataOutputSampleBufferDelegate {
    
    private let videoDataOutputQueue = DispatchQueue(label: "CameraFeedDataOutput", qos: .userInitiated,
                                                     attributes: [], autoreleaseFrequency: .workItem)
    private var drawingView: UILabel = {
        let view = UILabel(frame: UIScreen.main.bounds)
        view.font = UIFont.boldSystemFont(ofSize: 30.0)
        view.textColor = .red
        view.translatesAutoresizingMaskIntoConstraints = false
        return view
    }()
    private var cameraFeedSession: AVCaptureSession?
    private var cameraFeedView: CameraFeedView! //Wrap
    
    override func viewDidLoad() {
        super.viewDidLoad()
        do {
            try setupAVSession()
        } catch {
            print("setup av session failed")
        }
    }
    
    func setupAVSession() throws {
        // Create a device discovery session for a wide-angle camera
        let wideAngle = AVCaptureDevice.DeviceType.builtInWideAngleCamera
        let discoverySession = AVCaptureDevice.DiscoverySession(deviceTypes: [wideAngle], mediaType: .video, position: .back)
        
        // Select a video device and make an input
        guard let videoDevice = discoverySession.devices.first else {
            print("Could not find a wide angle camera device.")
            return
        }
        guard let deviceInput = try? AVCaptureDeviceInput(device: videoDevice) else {
            print("Could not create video device input.")
            return
        }
        
        let session = AVCaptureSession()
        session.beginConfiguration()
        // Prefer 1080p video capture, but fall back to the highest available
        // quality if the camera cannot provide it
        if videoDevice.supportsSessionPreset(.hd1920x1080) {
            session.sessionPreset = .hd1920x1080
        } else {
            session.sessionPreset = .high
        }
        
        // Add a video input
        guard session.canAddInput(deviceInput) else {
            print("Could not add video device input to the session")
            return
        }
        session.addInput(deviceInput)
        
        // Add a video data output
        let dataOutput = AVCaptureVideoDataOutput()
        if session.canAddOutput(dataOutput) {
            session.addOutput(dataOutput)
            dataOutput.alwaysDiscardsLateVideoFrames = true
            dataOutput.videoSettings = [
                String(kCVPixelBufferPixelFormatTypeKey): Int(kCVPixelFormatType_420YpCbCr8BiPlanarFullRange)
            ]
            dataOutput.setSampleBufferDelegate(self, queue: videoDataOutputQueue)
        } else {
            print("Could not add video data output to the session")
        }
        
        let captureConnection = dataOutput.connection(with: .video)
        captureConnection?.preferredVideoStabilizationMode = .standard
        captureConnection?.videoOrientation = .portrait
        // Always process the frames
        captureConnection?.isEnabled = true
        session.commitConfiguration()
        cameraFeedSession = session
        
        // Get the interface orientation from the window scene to set the
        // proper video orientation on the capture connection.
        let videoOrientation: AVCaptureVideoOrientation
        switch view.window?.windowScene?.interfaceOrientation {
        case .landscapeRight:
            videoOrientation = .landscapeRight
        default:
            videoOrientation = .portrait
        }
        
        // Create and set up the video feed view
        cameraFeedView = CameraFeedView(frame: view.bounds, session: session, videoOrientation: videoOrientation)
        setupVideoOutputView(cameraFeedView)
        cameraFeedSession?.startRunning()
    }
    
    // Assumed implementation (omitted from the original answer): adds the
    // preview view and the label that displays the recognized text.
    private func setupVideoOutputView(_ videoOutputView: UIView) {
        view.addSubview(videoOutputView)
        view.addSubview(drawingView)
        NSLayoutConstraint.activate([
            drawingView.leadingAnchor.constraint(equalTo: view.leadingAnchor, constant: 16),
            drawingView.trailingAnchor.constraint(equalTo: view.trailingAnchor, constant: -16),
            drawingView.centerYAnchor.constraint(equalTo: view.centerYAnchor)
        ])
    }
The key functions to implement once the AVCaptureSession is set up are the delegate and the request handler:
    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        let requestHandler = VNImageRequestHandler(cmSampleBuffer: sampleBuffer, orientation: .down)
        let request = VNRecognizeTextRequest(completionHandler: textDetectHandler)
        do {
            // Perform the text-detection request.
            try requestHandler.perform([request])
        } catch {
            print("Unable to perform the request: \(error).")
        }
    }
    
    func textDetectHandler(request: VNRequest, error: Error?) {
        guard let observations =
                request.results as? [VNRecognizedTextObservation] else { return }
        // Process each observation to extract the recognized text.
        let recognizedStrings = observations.compactMap { observation in
            // Return the string of the top VNRecognizedText instance.
            return observation.topCandidates(1).first?.string
        }
        // Hop back to the main queue for the UI update; the delegate is
        // called on videoDataOutputQueue.
        DispatchQueue.main.async {
            self.drawingView.text = recognizedStrings.first
        }
    }
}
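Since a request is performed on every frame, you may also want to tune it for real-time use. A minimal sketch, using standard VNRecognizeTextRequest properties (the specific values here are arbitrary choices):

    let request = VNRecognizeTextRequest(completionHandler: textDetectHandler)
    // .fast trades accuracy for throughput, which suits a live feed;
    // the default .accurate level is noticeably slower per frame.
    request.recognitionLevel = .fast
    // Language correction adds latency and is often unnecessary live.
    request.usesLanguageCorrection = false
    // Skip text smaller than 1/16 of the image height (normalized units).
    request.minimumTextHeight = 0.0625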
Note that you will probably want to process each of the recognizedStrings to choose the one with the highest confidence, but this is a proof of concept. You could also add a bounding box; the docs have an example of that.
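For example, a variant of textDetectHandler that keeps only high-confidence candidates and converts one bounding box into pixel coordinates might look like this (the 0.5 threshold is an arbitrary choice, and 1080x1920 assumes the .hd1920x1080 preset in portrait):

    func textDetectHandler(request: VNRequest, error: Error?) {
        guard let observations = request.results as? [VNRecognizedTextObservation] else { return }
        
        // Pick the single highest-confidence candidate across observations,
        // ignoring anything below an (arbitrary) 0.5 confidence threshold.
        let best = observations
            .compactMap { $0.topCandidates(1).first }
            .filter { $0.confidence > 0.5 }
            .max { $0.confidence < $1.confidence }
        
        // Vision bounding boxes are normalized with a lower-left origin;
        // VNImageRectForNormalizedRect converts one to image coordinates.
        if let box = observations.first?.boundingBox {
            let imageRect = VNImageRectForNormalizedRect(box, 1080, 1920)
            print("First observation occupies \(imageRect)")
        }
        
        DispatchQueue.main.async {
            self.drawingView.text = best?.string
        }
    }

To draw the box over the preview, AVCaptureVideoPreviewLayer's layerRectConverted(fromMetadataOutputRect:) is the usual bridge into layer coordinates, though note that Vision's coordinates are flipped vertically relative to the metadata-output rect.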