PaddleOCR-ONNXで100枚OCRした時の処理速度を測ってみた！

WoLNamesBlackedOut、次はOCRしたくて

いろいろOCR試してたのですが、PaddleOCRが良さそうなので、先日モデルをONNXにエクスポートしました。

で、これモデルが軽快でCPU推論でもサッと結果が出てくるので、もしかするとGPUメモリとのデータの行き来がないCPU推論でもいいんじゃね？
でもやっぱりGPU（DirectML）でのバッチ推論の方が速かったりする？

というのが気になって、ベンチマークしてみました！

ベンチマーク！

import cv2
import numpy as np
import onnxruntime as ort
import os
import time

# =====================================================================
# 1. 確定：GitHubリポジトリ内の公式英語辞書ファイルから文字マップを自動生成
# =====================================================================
dict_path = "./ppocr/utils/dict/ppocrv5_en_dict.txt"

# Fail early with an explicit message when the dictionary file is missing.
if not os.path.exists(dict_path):
    raise FileNotFoundError(
        f"公式辞書ファイルが見つかりません。現在の実行フォルダ（D:\\PaddleOCR）に "
        f"'{dict_path}' が存在するか確認してください。"
    )

# Index 0 holds the "blank" placeholder; the dictionary entries follow in
# file order, one per line. Only line terminators are stripped so that an
# entry consisting of whitespace survives.
with open(dict_path, "r", encoding="utf-8") as f:
    character_list = ["blank", *(line.strip("\r\n") for line in f)]

# Make sure the space character is decodable even if the file omits it.
if " " not in character_list:
    character_list.append(" ")

print(f"公式辞書の読み込みに成功しました。総文字数: {len(character_list)}")


# =====================================================================
# 2. 公式と100%一致させた前処理 (リサイズ ＋ ゼロパディング)
# =====================================================================
def preprocess_image(img_path, target_shape=(3, 48, 320)):
    """Load an image and turn it into a padded, normalized model input.

    The image is resized to the target height while keeping its aspect
    ratio (the width is capped at the target width), scaled to the range
    [-1, 1], converted from HWC to CHW, and right-padded with zeros.

    Args:
        img_path: Path of the image file to read with ``cv2.imread``.
        target_shape: ``(channels, height, width)`` of the output tensor.

    Returns:
        A float32 array of shape ``(1, channels, height, width)``.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None``.
    """
    image = cv2.imread(img_path)
    if image is None:
        raise FileNotFoundError(f"画像が見つかりません: {img_path}")

    channels, out_h, out_w = target_shape
    src_h, src_w, _ = image.shape

    # Width that preserves the aspect ratio at the target height, capped so
    # it never exceeds the padded canvas.
    aspect = src_w / float(src_h)
    new_w = min(int(np.ceil(out_h * aspect)), out_w)

    scaled = cv2.resize(image, (new_w, out_h), interpolation=cv2.INTER_LINEAR)

    # Map pixel values to [-1, 1], then reorder HWC -> CHW.
    chw = (scaled.astype(np.float32) / 127.5 - 1.0).transpose((2, 0, 1))

    # Zero canvas; the resized image occupies the left part.
    canvas = np.zeros((channels, out_h, out_w), dtype=np.float32)
    canvas[:, :, :new_w] = chw

    # Prepend the batch axis.
    return canvas[np.newaxis, ...]

# =====================================================================
# 3. 修正：スペースを消さない CTC Greedy デコーダー
# =====================================================================
def ctc_decode(preds_single, character_list):
    """Greedy CTC decoding of one recognition result.

    For every time step the most probable class is taken. Class 0 is treated
    as the blank token, consecutive repeats of the same class are collapsed,
    and classes outside ``character_list`` are ignored. Space characters are
    kept like any other character.

    Args:
        preds_single: Array of per-step class scores shaped
            ``(steps, classes)``. A leading batch axis of length 1, i.e.
            ``(1, steps, classes)``, is also accepted and removed.
        character_list: Sequence mapping class index to character, with the
            blank token at index 0.

    Returns:
        ``(text, score)`` where ``text`` is the decoded string and ``score``
        is the mean of the winning scores of the emitted characters
        (``0.0`` when nothing was emitted).

    Raises:
        ValueError: If a 3-D input holds more than one batch item.
    """
    preds_single = np.asarray(preds_single)

    # Strip the batch axis. The original branch here assigned the array to
    # itself, so 3-D input fell through and failed inside the loop below.
    if preds_single.ndim == 3:
        if preds_single.shape[0] != 1:
            raise ValueError(
                "ctc_decode expects a single sample, got a batch of "
                f"{preds_single.shape[0]}"
            )
        preds_single = preds_single[0]

    preds_idx = np.argmax(preds_single, axis=-1)
    preds_prob = np.max(preds_single, axis=-1)

    char_res = []
    conf_res = []
    last_idx = -1
    num_chars = len(character_list)

    for idx, prob in zip(preds_idx, preds_prob):
        if idx == 0:
            # Blank: emits nothing, but resets the repeat tracker so the
            # same character can be emitted again after it.
            last_idx = idx
            continue
        if idx == last_idx:
            # Same class as the previous step: collapse the repeat.
            continue

        if idx < num_chars:
            char_res.append(character_list[idx])
            conf_res.append(prob)

        last_idx = idx

    text = "".join(char_res)
    score = np.mean(conf_res) if conf_res else 0.0
    return text, score

# =====================================================================
# 4. ベンチマーク実行関数 (100回 / 100バッチ対応)
# =====================================================================
def run_benchmark(name, providers, model_path, single_tensor, batch_tensor,
                  num_runs=100):
    """Time sequential and batched inference for one execution-provider setup.

    Args:
        name: Label printed in the progress message.
        providers: Execution-provider list passed to ``ort.InferenceSession``.
        model_path: Path of the ONNX model file.
        single_tensor: Input fed to each sequential run.
        batch_tensor: Input fed to the batched run. Its first dimension is
            used as the image count for the per-image figure.
        num_runs: Number of timed sequential runs (default 100).

    Returns:
        A dict with ``seq_per_img``, ``seq_total``, ``batch_per_img`` and
        ``batch_total`` in milliseconds, or ``None`` when the session cannot
        be created or the first requested provider was not enabled.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    print(f"[{name}] 計測中...")
    try:
        session = ort.InferenceSession(model_path, providers=providers)
    except Exception as e:
        # Best-effort: an unusable provider must not abort the other runs.
        print(f" -> エラー (利用不可): {e}")
        return None

    # NOTE(review): onnxruntime can warn and fall back to another provider
    # instead of raising when the requested one is missing. Without this
    # check the fallback's timings would be reported under `name`.
    if providers:
        first = providers[0]
        requested = first if isinstance(first, str) else first[0]
        if requested not in session.get_providers():
            print(f" -> エラー (利用不可): {requested} が有効化されませんでした")
            return None

    # Take the first input and output of the model by name.
    input_name = session.get_inputs()[0].name
    output_name = session.get_outputs()[0].name

    # -----------------------------------------------------------------
    # A. One image per call, repeated num_runs times
    # -----------------------------------------------------------------
    session.run([output_name], {input_name: single_tensor})  # warm-up

    start_time = time.perf_counter()
    for _ in range(num_runs):
        session.run([output_name], {input_name: single_tensor})
    end_time = time.perf_counter()

    seq_total = (end_time - start_time) * 1000  # total, milliseconds
    seq_per_img = seq_total / num_runs          # per image, milliseconds

    # -----------------------------------------------------------------
    # B. The whole batch in a single call
    # -----------------------------------------------------------------
    session.run([output_name], {input_name: batch_tensor})   # warm-up

    start_time = time.perf_counter()
    session.run([output_name], {input_name: batch_tensor})
    end_time = time.perf_counter()

    # Divide by the real batch size rather than an assumed 100.
    batch_size = len(batch_tensor)
    batch_total = (end_time - start_time) * 1000  # total, milliseconds
    batch_per_img = batch_total / batch_size      # per image, milliseconds

    return {
        "seq_per_img": seq_per_img,
        "seq_total": seq_total,
        "batch_per_img": batch_per_img,
        "batch_total": batch_total
    }

# =====================================================================
# 5. メイン処理
# =====================================================================
if __name__ == "__main__":
    # Use the first preferred model that exists; otherwise fall back to the
    # v3 model path without checking it.
    preferred_models = ("./c_ppocr-v5-rec_sim.onnx", "./ppocr-v5-rec.onnx")
    model_path = next(
        (candidate for candidate in preferred_models if os.path.exists(candidate)),
        "./ppocr-v3-rec.onnx",
    )

    print(f"使用するモデルファイル: {model_path}")

    # Primary test image, with a second file name as fallback.
    image_path = "./test_name_chiyomaru.png"
    if not os.path.exists(image_path):
        image_path = "./test_name_CalocenRieti.png"

    single_tensor = preprocess_image(image_path)
    # Replicate the single image into a batch of 100.
    batch_tensor = np.repeat(single_tensor, 100, axis=0)

    # Device label -> execution providers, in the order they are measured.
    provider_sets = (
        ("CPU", ['CPUExecutionProvider']),
        ("DirectML", ['DmlExecutionProvider', 'CPUExecutionProvider']),
    )

    results = {}
    print("\nベンチマークを開始します...")

    for label, provider_list in provider_sets:
        results[label] = run_benchmark(
            label, provider_list, model_path, single_tensor, batch_tensor
        )

    # Final report.
    rule = "=" * 60
    print("\n" + rule)
    print("                    推論速度ベンチマーク結果 (100枚)")
    print(rule)

    for dev, res in results.items():
        print(f"【 {dev} 】")
        if res is None:
            print("  利用不可、またはエラーが発生しました。")
        else:
            print(f"  ・1つずつ100回推論 : 画像1枚あたり {res['seq_per_img']:6.2f} ms | 100枚での合計時間 {res['seq_total']:6.2f} ms")
            print(f"  ・100バッチ1回推論 : 画像1枚あたり {res['batch_per_img']:6.2f} ms | 100枚での合計時間 {res['batch_total']:6.2f} ms")
        print("-" * 60)

結果！

ONNXファイルは、シンプル化済みで、入出力以外はFP16化済みのものを使用。

(venv) PS D:\PaddleOCR> python ppocrv5_onnx_infer.py
公式辞書の読み込みに成功しました。総文字数: 438
使用するモデルファイル: ./c_ppocr-v5-rec_sim.onnx

ベンチマークを開始します...
[CPU] 計測中...
[DirectML] 計測中...

============================================================
                    推論速度ベンチマーク結果 (100枚)
============================================================
【 CPU 】
  ・1つずつ100回推論 : 画像1枚あたり   8.33 ms | 100枚での合計時間 832.50 ms
  ・100バッチ1回推論 : 画像1枚あたり  17.70 ms | 100枚での合計時間 1770.25 ms
------------------------------------------------------------
【 DirectML 】
  ・1つずつ100回推論 : 画像1枚あたり   4.41 ms | 100枚での合計時間 440.79 ms
  ・100バッチ1回推論 : 画像1枚あたり   0.99 ms | 100枚での合計時間  99.20 ms
------------------------------------------------------------

感想！

たくさんOCRするのであれば、GPU＋バッチが大正義！
CPU推論だったら1枚ずつ実行するのが良さげ。CPU推論でもそこまで遅くなさそう。

アプリ実装する時にはGPUでバッチかな？