图像上显示阿拉伯文文本有问题

ctrmrzij  于 2021-08-20  发布在  Java
关注(0)|答案(0)|浏览(243)
from PIL import ImageFont, ImageDraw, Image, ImageOps
import matplotlib.pyplot as plt
import os
import numpy as np
from datetime import datetime as dt
import arabic_reshaper
from bidi.algorithm import get_display

def data_gen(font_folder, font_list,
             data_str=("0 1 2 3 4 5 6 7 8 9",),
             img_size=(12000, 64), font_size=(20,)):
    """Render every string in every font and size onto a black image.

    Each rendered image is written to ``OCR/seg/letter <n>.png`` (``n`` is a
    running counter starting at 0) and also collected, together with the
    string it shows, in the returned lists.

    Args:
        font_folder: Directory that contains the font files.
        font_list: File names of the fonts inside ``font_folder``.
        data_str: Strings to draw, one image per string.
        img_size: ``(width, height)`` of each generated image in pixels.
        font_size: Font sizes to render each string at.

    Returns:
        A tuple ``(images, labels)``: ``images`` holds one NumPy array per
        rendered image, ``labels`` holds the source string of each image in
        the same order.
    """
    out_dir = r'OCR/seg/'
    size_wh = tuple(img_size)  # Image.new is documented to take a 2-tuple

    # lists to store images and labels
    all_roi = []
    all_labels = []
    count = 0

    # Only touch the file system when there is something to render.
    if font_list:
        os.makedirs(out_dir, exist_ok=True)

    # loop over fonts
    for i, f in enumerate(font_list):
        print("\nfont", i + 1, ":", f)
        font_path = os.path.join(font_folder, str(f))

        # loop over font sizes
        for size in font_size:
            # The font depends only on (file, size), so load it once per size
            # instead of once per word. Pillow's default encoding is Unicode.
            font = ImageFont.truetype(font_path, size)

            # position of text: fixed left margin, roughly centred vertically
            x = 20
            y = size_wh[1] // 2 - size // 2

            for word in data_str:
                # Join the letters into their contextual forms, then reorder
                # them for right-to-left display.
                # NOTE(review): letters the font has no glyph for (e.g. some
                # Jawi letters) will still render wrongly - verify per font.
                text = arabic_reshaper.reshape(word)
                bidi = get_display(text)

                # blank black 8-bit image with the text drawn in white
                img = Image.new('L', size_wh, "black")
                ImageDraw.Draw(img).text((x, y), bidi, fill=255, font=font)

                # Save through Pillow: cv2 is never imported in this file.
                img.save(out_dir + "letter " + str(count) + ".png")
                count += 1

                # convert image from PIL format to array and keep its label
                all_roi.append(np.array(img))
                all_labels.append(word)

    return all_roi, all_labels
# Character inventory: isolated letters, letters written with a tatweel (ـ)
# on one or both sides, digits, and punctuation signs.
all_let = " ا إ ب ت ث ج چ ح خ د ذ ر ز س ش ص ض ط ظ ع غ ڠ ف ڤ ق ک ݢ ل م ن و ۏ ه ة ء ي ڽ ى "
all_let_styles = "ـا بـ ـبـ ــب تـ ـتـ ـت ثـ ـثـ ـث جـ ـجـ ـج چـ ـچـ ـچ حـ ـح ـحـ خـ ـخـ ـخ \
ـد ـذ ـر ـز سـ ـسـ ـس شـ ـشـ ـش صـ ـصـ ـص ضـ ـضـ ـض طـ ـطـ ـط ظـ ـظـ ـظ عـ ـعـ ـع غـ ـغـ ـغ ڠـ ـڠـ ـڠ فـ ـفـ ـف ڤـ ـڤـ ـڤ \
قـ ـقـ ـق کـ ـکـ ـک ݢـ ـݢـ ـݢ لـ ـلـ ـل مـ ـمـ ـم نـ ـنـ ـن ـو ـۏ هـ ـهـ ـه يـ ـيـ ـي ـى ڽـ ـڽـ ـڽ ـة"
digit = ' 0 1 2 3 4 5 6 7 8 9'
# The backslash is escaped explicitly: a bare "\ " is an invalid escape
# sequence. The resulting string value is unchanged.
signs = ''' ! @ # % ^ & ? / ( ) { } [ ] < > * - + = \\ : ; ' . '''
ds = all_let + all_let_styles + digit + signs

# One dataset entry per whitespace-separated token; split() without an
# argument never yields empty strings, which would render as blank images.
dataset = all_let_styles.split()

# Change this to the current user folder that holds the fonts
font_folder = r'/fonts/'
# Sorted so the numbering of the generated files is the same on every run.
font_list = sorted(os.listdir(font_folder))
ims = [64, 64]

data, labels = data_gen(font_folder, font_list,
                        data_str=dataset,
                        img_size=ims)

print("\nlength of image list:", len(data))
print("length of label list:", len(labels))

好的，我正在尝试创建一个 OCR。这是我的代码片段，用于建立一个 Jawi（爪夷文，一种基于阿拉伯字母的书写系统）字符数据集：把每个字符绘制在黑色背景的图像上。我目前的问题是：一些不属于标准阿拉伯字母表的字母无法正确显示，还有一些字母的显示效果很奇怪。我会附上示例（例1、例2）。有人知道问题出在哪里吗？我是一个编程新手，给您带来不便，十分抱歉。

暂无答案!

目前还没有任何答案,快来回答吧!

相关问题