Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Development Issue] 试图引入Stariver(团子OCR)作为Detector与OCR工具,但输出时遇到了一些问题。 #611

Open
PiDanShouRouZhouXD opened this issue Apr 27, 2024 · 0 comments

Comments

@PiDanShouRouZhouXD
Copy link
Contributor

我在开发团子OCR的支持时,输出的文字的方向不正确。具体效果如下:

sosonofuriren_sample_01

sosonofuriren_sample_02

代码如下:

import os
import numpy as np
import cv2
import base64

from typing import List, Tuple
import requests
import json

from .common import CommonDetector
from ..utils import Quadrilateral

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()
# BUG FIX: the original passed '' as the getenv default, which made the
# `if self.token is None` guard in StariverDetector.__init__ unreachable —
# an unset variable yielded '' rather than None.  With no default, getenv
# returns None when the variable is missing, so the guard works as intended.
STARRIVER_OCR_TOKEN = os.getenv('STARRIVER_OCR_TOKEN')

class StariverDetector(CommonDetector):
    """Text detector backed by the Stariver (团子OCR) manga OCR web service.

    Encodes the input image as base64 JPEG, posts it to the remote
    endpoint, and converts the returned text blocks and mask into the
    project's ``Quadrilateral`` list / mask-array representation.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.token = STARRIVER_OCR_TOKEN
        # BUG FIX: the module-level constant defaults to '' when the env var
        # is unset, so the original `is None` test could never fire.  Treat
        # any falsy token (None or empty string) as "not configured".
        if not self.token:
            raise ValueError('团子OCR需要设置 STARRIVER_OCR_TOKEN 的环境变量。')
        self.url = 'https://dl.ap-sh.starivercs.cn/v2/manga_trans/advanced/manga_ocr'

    async def _detect(self, image: np.ndarray, detect_size: int, text_threshold: float, box_threshold: float,
                        unclip_ratio: float, verbose: bool = False) -> Tuple[List[Quadrilateral], np.ndarray, np.ndarray]:
        """Detect text regions in ``image`` via the remote OCR service.

        Returns ``(textlines, raw_mask, None)``; the third slot of the
        annotated return triple is always ``None`` here (the refined mask
        is not produced by this backend).

        Raises:
            requests.HTTPError: if the service responds with an error status.

        NOTE(review): ``requests`` is a blocking client inside an ``async``
        method — it will stall the event loop for the duration of the HTTP
        call.  Consider an async HTTP client if this matters to callers.
        """
        img_encoded = cv2.imencode('.jpg', image)[1]
        img_base64 = base64.b64encode(img_encoded).decode('utf-8')
        textlines: List[Quadrilateral] = []
        payload = {
            "token": self.token,
            "mask": True,
            "refine": True,
            "filtrate": True,
            "disable_skip_area": True,
            "detect_scale": 3,
            "merge_threshold": 0.5,
            "low_accuracy_mode": False,
            "image": img_base64
        }

        # Use the `json=` parameter so requests serializes the body AND sets
        # the `Content-Type: application/json` header (the original posted a
        # pre-dumped string via `data=` with no content type).  A timeout
        # prevents an unresponsive server from hanging the pipeline forever.
        response = requests.post(self.url, json=payload, timeout=60)
        # Fail loudly on HTTP errors instead of raising an opaque error from
        # .json() / ['Data'] on an error page.
        response.raise_for_status()
        response_data = response.json()['Data']

        if verbose:
            with open('det_result.json', 'w', encoding='utf-8') as f:
                json.dump(response_data, f, ensure_ascii=False, indent=4)

        for block in response_data['text_block']:
            # Corner order expected by Quadrilateral: UL, UR, LR, LL.
            pts_from_ocr = np.array([
                block['block_coordinate']['upper_left'],
                block['block_coordinate']['upper_right'],
                block['block_coordinate']['lower_right'],
                block['block_coordinate']['lower_left']
            ])
            text = ''.join(block['texts'])

            if verbose:
                print(f"识别出来的文本块: {text}")

            # NOTE(review): the service reports orientation via
            # block['is_vertical'] ('v' when 1, else 'h') but it is not yet
            # forwarded to Quadrilateral — likely related to the wrong text
            # direction seen in the output.
            # if block['is_vertical'] == 1:
            #     direction = 'v'
            # else:
            #     direction = 'h'

            fg_color = block['foreground_color']
            bg_color = block['background_color']

            textlines.append(Quadrilateral(pts=pts_from_ocr, text=text, prob=1.0, fg_r=fg_color[0], fg_g=fg_color[1], fg_b=fg_color[2], bg_r=bg_color[0], bg_g=bg_color[1], bg_b=bg_color[2]))

        raw_mask = self.base64_to_ndarray(response_data['mask'])

        # Upscale the service's mask by 2x, then scale values into 0-255.
        # NOTE(review): multiplying by 255 assumes the decoded mask holds
        # 0/1 values; if the service already returns 0-255 this saturates
        # everything non-zero — confirm against the API's mask format.
        mask_resized = cv2.resize(raw_mask, (raw_mask.shape[1] * 2, raw_mask.shape[0] * 2), interpolation=cv2.INTER_LINEAR)
        raw_mask = np.clip(mask_resized * 255, 0, 255).astype(np.uint8)

        return textlines, raw_mask, None

    @staticmethod
    def base64_to_ndarray(base64_str: str) -> np.ndarray:
        """Decode a base64-encoded image string into a grayscale ndarray."""
        img_data = base64.b64decode(base64_str)
        img_array = np.frombuffer(img_data, np.uint8)
        img = cv2.imdecode(img_array, cv2.IMREAD_GRAYSCALE)  # read as grayscale
        return img

请问这可能是什么原因导致的?应该检查代码的哪些部分?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant