""" Copyright (c) 2019-present NAVER Corp. MIT License """ # -*- coding: cp932 -*- import sys import os import time import argparse import torch import torch.nn as nn import torch.backends.cudnn as cudnn from torch.autograd import Variable from PIL import Image, ImageDraw, ImageFont from utils import dataIterator, load_dict, gen_sample, load_mapping from encoder_decoder import Encoder_Decoder import cv2 from skimage import io import numpy as np import craft_utils import imgproc import file_utils import json import zipfile import xml.etree.cElementTree as ET import xml.dom.minidom as minidom import codecs from craft import CRAFT from collections import OrderedDict def copyStateDict(state_dict): if list(state_dict.keys())[0].startswith("module"): start_idx = 1 else: start_idx = 0 new_state_dict = OrderedDict() for k, v in state_dict.items(): name = ".".join(k.split(".")[start_idx:]) new_state_dict[name] = v return new_state_dict def str2bool(v): return v.lower() in ("yes", "y", "true", "t", "1") def pil2cv(imgPIL): imgCV_RGB = np.array(imgPIL, dtype = np.uint8) imgCV_BGR = np.array(imgPIL)[:, :, ::-1] return imgCV_BGR def cv2pil(imgCV): imgCV_RGB = imgCV[:, :, ::-1] imgPIL = Image.fromarray(imgCV_RGB) return imgPIL def cv2_putChar(draw, char, x, y, fontPIL, colorRGB): draw.text(xy = (x,y), text = char, fill = colorRGB, font = fontPIL) def cv2_putText_1(img, text, org, fontFace, fontScale, color): min_x, max_x, min_y, max_y = org imgPIL = cv2pil(img) draw = ImageDraw.Draw(imgPIL) fontPIL = ImageFont.truetype(font = fontFace, size = fontScale) if max_x - min_x >= max_y- min_y: #horizontal line y = max_y x = min_x for char in text: cv2_putChar(draw, char, x, y, fontPIL, color ) w, h = draw.textsize(char, font = fontPIL) x += w + 10 else: #vertical line y = min_y x = max_x - 10 for char in text: cv2_putChar(draw, char, x, y, fontPIL, color ) w, h = draw.textsize(char, font = fontPIL) y += h + 10 imgCV = pil2cv(imgPIL) return imgCV parser = argparse.ArgumentParser(description='Kindai document Recognition') #params for text detection parser.add_argument('--trained_model', default='./pretrain/synweights_4600.pth', type=str, help='pretrained model') parser.add_argument('--text_threshold', default=0.7, type=float, help='text confidence threshold') parser.add_argument('--low_text', default=0.4, type=float, help='text low-bound score') parser.add_argument('--link_threshold', default=0.4, type=float, help='link confidence threshold') parser.add_argument('--cuda', default=True, type=str2bool, help='Use cuda to train model') parser.add_argument('--canvas_size', default=1000, type=int, help='image size for inference') parser.add_argument('--mag_ratio', default=2, type=float, help='image magnification ratio') parser.add_argument('--poly', default=False, action='store_true', help='enable polygon type') parser.add_argument('--show_time', default=True, action='store_true', help='show processing time') parser.add_argument('--test_folder', default='/data/', type=str, help='folder path to input images') #params for text recognition parser.add_argument('--model_path', default='./pretrain/WAP_params.pkl', type=str) parser.add_argument('--dictionary_target', default='./pretrain/kindai_voc.txt', type=str) args = parser.parse_args() """ For test images in a folder """ image_list, _, _ = file_utils.get_files('./data/test') result_folder = './data/result/' if not os.path.isdir(result_folder): os.mkdir(result_folder) def test_net(net, image, text_threshold, link_threshold, low_text, cuda, poly): t0 = time.time() # resize 
    img_resized, target_ratio, size_heatmap = imgproc.resize_aspect_ratio(
        image, args.canvas_size, interpolation=cv2.INTER_LINEAR, mag_ratio=args.mag_ratio)
    ratio_h = ratio_w = 1 / target_ratio

    # preprocessing
    x = imgproc.normalizeMeanVariance(img_resized)
    x = torch.from_numpy(x).permute(2, 0, 1)    # [h, w, c] to [c, h, w]
    x = Variable(x.unsqueeze(0))                # [c, h, w] to [b, c, h, w]
    if cuda:
        x = x.cuda()

    # forward pass
    y, _ = net(x)

    # make score and link map
    score_text = y[0, :, :, 0].cpu().data.numpy()
    score_link = y[0, :, :, 1].cpu().data.numpy()

    t0 = time.time() - t0
    t1 = time.time()

    # post-processing
    boxes, polys = craft_utils.getDetBoxes(score_text, score_link, text_threshold, link_threshold, low_text, poly)

    # coordinate adjustment
    boxes = craft_utils.adjustResultCoordinates(boxes, ratio_w, ratio_h)
    polys = craft_utils.adjustResultCoordinates(polys, ratio_w, ratio_h)
    for k in range(len(polys)):
        if polys[k] is None:
            polys[k] = boxes[k]

    t1 = time.time() - t1

    # render results (optional)
    render_img = score_text.copy()
    render_img = np.hstack((render_img, score_link))
    ret_score_text = imgproc.cvt2HeatmapImg(render_img)

    if args.show_time:
        print("\ninfer/postproc time : {:.3f}/{:.3f}".format(t0, t1))

    return boxes, polys, ret_score_text


def test(text_detection_modelpara, ocr_modelpara, dictionary_target):
    # load the CRAFT text detection net
    net = CRAFT()  # initialize

    print('Loading text detection model from checkpoint {}'.format(text_detection_modelpara))
    if args.cuda:
        net.load_state_dict(copyStateDict(torch.load(text_detection_modelpara)))
    else:
        net.load_state_dict(copyStateDict(torch.load(text_detection_modelpara, map_location='cpu')))

    if args.cuda:
        net = net.cuda()
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = False

    # hyper-parameters of the attention-based encoder-decoder recognizer
    params = {}
    params['n'] = 256
    params['m'] = 256
    params['dim_attention'] = 512
    params['D'] = 684
    params['K'] = 5748
    params['growthRate'] = 24
    params['reduction'] = 0.5
    params['bottleneck'] = True
    params['use_dropout'] = True
    params['input_channels'] = 3
    params['cuda'] = args.cuda

    # load the OCR model
    OCR = Encoder_Decoder(params)
    if args.cuda:
        OCR.load_state_dict(copyStateDict(torch.load(ocr_modelpara)))
    else:
        OCR.load_state_dict(copyStateDict(torch.load(ocr_modelpara, map_location='cpu')))
    if args.cuda:
        OCR = OCR.cuda()  # keep the recognizer on GPU so it matches its CUDA inputs
        OCR = torch.nn.DataParallel(OCR)
        cudnn.benchmark = False

    OCR.eval()
    net.eval()

    # load dictionary (token index -> character code)
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.items():
        worddicts_r[vv] = kk

    t = time.time()

    fontPIL = '/usr/share/fonts/truetype/fonts-japanese-gothic.ttf'  # Japanese font
    size = 40
    colorBGR = (0, 0, 255)

    paper = ET.Element('paper')
    paper.set('xmlns', "http://codh.rois.ac.jp/modern-magazine/")

    # load data
    for k, image_path in enumerate(image_list[:]):
        print("Test image {:d}/{:d}: {:s}".format(k + 1, len(image_list), image_path), end='\r')
        res_img_file = result_folder + "res_" + os.path.basename(image_path)
        #print(res_img_file, os.path.basename(image_path), os.path.exists(res_img_file))
        #if os.path.exists(res_img_file): continue
        #image = imgproc.loadImage(image_path)
        '''image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        ret2, image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        height = image.shape[0]
        width = image.shape[1]
        scale = 1000.0 / height
        H = int(image.shape[0] * scale)
        W = int(image.shape[1] * scale)
        image = cv2.resize(image, (W, H))
        print(image.shape, image_path)
        cv2.imwrite(image_path, image)
        continue'''
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        h, w = image.shape[0], image.shape[1]
        print(image_path)

        page = ET.SubElement(paper, "page")
        page.set('file', os.path.basename(image_path).replace('.jpg', ''))
        page.set('height', str(h))
        page.set('width', str(w))
        page.set('dpi', str(100))
        page.set('number', str(1))

        # text line detection
        bboxes, polys, score_text = test_net(net, image, args.text_threshold,
                                             args.link_threshold, args.low_text, args.cuda, args.poly)

        text = []
        locations = []
        for i, box in enumerate(bboxes):
            poly = np.array(box).astype(np.int32)
            min_x = np.min(poly[:, 0])
            max_x = np.max(poly[:, 0])
            min_y = np.min(poly[:, 1])
            max_y = np.max(poly[:, 1])
            if min_x < 0:
                min_x = 0
            if min_y < 0:
                min_y = 0
            #image = cv2.rectangle(image, (min_x, min_y), (max_x, max_y), (0, 255, 0), 3)
            input_img = image[min_y:max_y, min_x:max_x]
            w = max_x - min_x + 1
            h = max_y - min_y + 1

            line = ET.SubElement(page, "line")
            line.set("x", str(min_x))
            line.set("y", str(min_y))
            line.set("height", str(h))
            line.set("width", str(w))

            # scale the crop so the short side becomes 20 px and the long side
            # rounds to a multiple of 20
            if w < h:
                rate = 20.0 / w
                w = int(round(w * rate))
                h = int(round(h * rate / 20.0) * 20)
            else:
                rate = 20.0 / h
                w = int(round(w * rate / 20.0) * 20)
                h = int(round(h * rate))
            #print(w, h, rate)
            input_img = cv2.resize(input_img, (w, h))

            # grayscale recognizer input, shape (1, H, W)
            mat = np.zeros([1, h, w], dtype='uint8')
            mat[0, :, :] = 0.299 * input_img[:, :, 0] + 0.587 * input_img[:, :, 1] + 0.114 * input_img[:, :, 2]

            xx_pad = mat.astype(np.float32) / 255.
            xx_pad = torch.from_numpy(xx_pad[None, :, :, :])  # (1, 1, H, W)
            if args.cuda:
                xx_pad = xx_pad.cuda()

            # beam search decoding; keep the hypothesis with the best
            # length-normalized score
            with torch.no_grad():
                sample, score, alpha_past_list = gen_sample(OCR, xx_pad, params, args.cuda, k=10, maxlen=600)
            score = score / np.array([len(s) for s in sample])
            ss = sample[score.argmin()]
            alpha_past = alpha_past_list[score.argmin()]

            result = ''
            i = 0
            location = []
            for vv in ss:
                if vv == 0:  # end-of-sequence token
                    break
                # per-character attention map: difference of cumulative attention
                alpha = alpha_past[i]
                if i != 0:
                    alpha = alpha_past[i] - alpha_past[i - 1]
                (y, x) = np.unravel_index(np.argmax(alpha, axis=None), alpha.shape)
                #print(int(16 * x / rate), int(16 * y / rate), chr(int(worddicts_r[vv], 16)))
                location.append([int(16 * x / rate) + min_x, int(16 * y / rate) + min_y])
                #image = cv2.circle(image, (int(16 * x / rate) - 8 + min_x, int(16 * y / rate) + 8 + min_y), 25, (0, 0, 255), -1)
                result += chr(int(worddicts_r[vv], 16))
                '''char = ET.SubElement(line, "char")
                char.set('num_cand', '1')
                char.set('x', str(int(16 * x / rate) - 8 + min_x))
                char.set('y', str(int(16 * y / rate) + 8 + min_y))
                res = ET.SubElement(char, "result")
                res.set('CC', str(100))
                res.text = chr(int(worddicts_r[vv], 16))
                cand = ET.SubElement(char, "cand")
                cand.set('CC', str(100))
                cand.text = chr(int(worddicts_r[vv], 16))'''
                i += 1

            line.text = result
            text.append(result)
            locations.append(location)
            image = cv2_putText_1(img=image, text=result, org=(min_x, max_x, min_y, max_y),
                                  fontFace=fontPIL, fontScale=size, color=colorBGR)

        print('save image')
        # save score text
        filename, file_ext = os.path.splitext(os.path.basename(image_path))
        mask_file = result_folder + "/res_" + filename + '_mask.jpg'
        #cv2.imwrite(mask_file, score_text)

        file_utils.saveResult(image_path, image, polys, dirname=result_folder)

    # write the accumulated page/line results as Shift_JIS XML
    xml_string = ET.tostring(paper, 'Shift_JIS')
    fout = codecs.open('./data/result.xml', 'w', 'shift_jis')
    fout.write(xml_string.decode('shift_jis'))
    fout.close()

    print("elapsed time : {}s".format(time.time() - t))


if __name__ == "__main__":
    test(args.trained_model, args.model_path, args.dictionary_target)
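
# Example invocation (a sketch, assuming the default checkpoint paths declared in the
# argparse section above and the directory layout this script expects: input pages under
# ./data/test, rendered results under ./data/result/, and line-level XML in ./data/result.xml).
# The file name "test.py" is an assumption; use whatever name this script is saved under.
#
#   python test.py --trained_model ./pretrain/synweights_4600.pth \
#                  --model_path ./pretrain/WAP_params.pkl \
#                  --dictionary_target ./pretrain/kindai_voc.txt \
#                  --cuda true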