forked from ocropus-archive/DUP-ocropy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ocropus-hocr
executable file
·161 lines (125 loc) · 5.21 KB
/
ocropus-hocr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python
import __builtin__ as python
import random as pyrandom
import sys
import os.path
import re
import glob
import argparse
import codecs
import numpy as np
from matplotlib.pyplot import imread
import ocrolib
from ocrolib import hocr
# Command-line interface.
# The usage text is passed as ``description``: the first positional parameter
# of ArgumentParser is ``prog`` (the program name), so passing the text
# positionally would put the whole paragraph into the "usage:" line.
# RawDescriptionHelpFormatter keeps the line layout of the text in --help.
parser = argparse.ArgumentParser(
    description="""
Construct an HTML output file in hOCR format by putting together
the recognition results for each page in sequence.
You should usually invoke this program as
ocropus-hocr 'book/????.bin.png'
For each page like 'book/0001.bin.png', it uses the following files:
book/0001.bin.png # page image
book/0001.pseg.png # page segmentation
book/0001/010001.txt # recognizer output for lines
""",
    formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("-b","--nobreaks",action="store_true",help="don't output line breaks")
parser.add_argument("-p","--nopars",action="store_true",help="don't output paragraphs")
parser.add_argument("-s","--fscale",type=float,default=1.0,help="scale factor for translating xheights into font size (use 0 to disable), default: %(default)s")
parser.add_argument("-o","--output",default="book.html",help="output file, default: %(default)s")
parser.add_argument('files',nargs='+')
args = parser.parse_args()
# The usage text tells users to pass a quoted pattern, so the file arguments
# are handed to ocrolib.glob_all (presumably expands the patterns -- confirm
# against ocrolib).
args.files = ocrolib.glob_all(args.files)
# Output stream for the hOCR document, encoded as UTF-8.
ostream = codecs.open(args.output,"w","utf-8")
def E(*args):
    """Write the arguments to stderr as one space-separated line.

    Every argument is converted with ``str`` before being joined.
    """
    message = " ".join(map(str, args))
    sys.stderr.write(message)
    sys.stderr.write("\n")
def P(*args):
    """Write the string arguments, concatenated, to ``ostream`` plus a newline."""
    line = "".join(args)
    ostream.write(line+"\n")
def PN(*args):
    """Write the string arguments, concatenated, to ``ostream`` without a newline."""
    fragment = "".join(args)
    ostream.write(fragment)
# Report the destination, then estimate a document-wide median x-height.
# It is used later for paragraph detection and relative font sizes, and it
# stays None when neither .xheight files nor line images are found.
E("writing to", args.output)
median_xheight = None
# one line directory per page: the page file name with its extensions removed
dirs = [ocrolib.allsplitext(name)[0] for name in args.files]
xhfiles = [f for d in dirs for f in glob.glob(d + "/??????.xheight")]
if len(xhfiles) > 5:
    # enough per-line .xheight files: take the median of the stored values
    xheights = [float(ocrolib.read_text(f)) for f in xhfiles]
    if xheights:
        median_xheight = np.median(xheights)
else:
    # otherwise use half the median pixel height of up to 100 randomly
    # chosen line images
    lfiles = [f for d in dirs for f in glob.glob(d + "/??????.bin.png")]
    pyrandom.shuffle(lfiles)
    if lfiles:
        median_xheight = 0.5 * np.median([imread(f).shape[0] for f in lfiles[:100]])
E("median_xheight", median_xheight)
# Emit the hOCR document: header, one ocr_page DIV per input image, footer.
# The outer try/finally closes the output stream even when a page raises, so
# whatever was written so far is flushed to disk.
try:
    P(hocr.header())

    # (x0, y0) of the previous text line, used for paragraph-break detection.
    # NOTE(review): this is not reset between pages -- confirm that carrying
    # it across page boundaries is intended.
    last_coords = None

    for arg in args.files:
        base,_ = ocrolib.allsplitext(arg)
        try:
            E("===",arg)
            image = ocrolib.read_image_binary(arg)
            height, width = image.shape
            P("<div class='ocr_page' title='image %s; bbox 0 0 %d %d'>"%(arg,width,height))

            # to proceed, we need a pseg file and a
            # subdirectory containing text lines
            if not os.path.exists(base+".pseg.png"):
                E("%s: no such file"%(base+".pseg.png",))
                continue
            if not os.path.isdir(base):
                E("%s: no such directory"%base)
                continue

            # iterate through the text lines in reading order, based
            # on the page segmentation file
            pseg = ocrolib.read_page_segmentation(base+".pseg.png")
            regions = ocrolib.RegionExtractor()
            regions.setPageLines(pseg)
            # index 0 is skipped (presumably the background region -- confirm
            # against ocrolib.RegionExtractor)
            for i in range(1,regions.length()):

                # keep track of the bounding box information for each line
                # and insert paragraph breaks as needed
                line_id = regions.id(i)
                y0,x0,y1,x1 = regions.bbox(i)
                if last_coords is not None:
                    lx0,ly0 = last_coords
                    dx,dy = x0-lx0,y1-ly0
                    par = False
                    # dy>0 is treated as a column break (moving upwards) and
                    # never starts a paragraph; without a median x-height no
                    # paragraph detection is possible at all
                    if dy<=0 and median_xheight is not None:
                        par = (abs(dy)>5*median_xheight       # whitespace separator
                               or dx>2*median_xheight         # indented paragraph
                               or abs(dx)>10*median_xheight)  # something else
                    if par and not args.nopars: P("<p />")
                last_coords = (x0,y0)

                # get the text for the line itself; line files are named by
                # the region id as six hex digits
                lbase = "%s/%06x"%(base,line_id)
                if not os.path.exists(lbase+".txt"):
                    E("note: line %s produced no output (it may not have contained text)"%(lbase+".bin.png"))
                    continue
                text = ocrolib.read_text(lbase+".txt")
                # Escape the characters that would otherwise be parsed as
                # markup.  '&' must be replaced first so the '&' of '&lt;' is
                # not escaped a second time.
                text = text.replace("&","&amp;").replace("<","&lt;")

                # accumulate information for each line here
                style = ""
                info = ""

                # estimate the font size for this line: the percentage of the
                # median x-height, clipped to 30..300 and rounded to a
                # multiple of 10
                if median_xheight is not None and os.path.exists(lbase+".xheight"):
                    xheight = float(ocrolib.read_text(lbase+".xheight"))
                    perc = int(np.clip(xheight*100.0/median_xheight,30,300))
                    perc = 10*((perc+5)//10)
                    if perc!=100:
                        style += "font-size:%d%%;"%perc

                # output geometric information
                info += "bbox %d %d %d %d"%(x0,y0,x1,y1)
                if os.path.exists(lbase+".baseline"):
                    info += "; baseline "+ocrolib.read_text(lbase+".baseline")

                # put it all together into a SPAN
                PN("<span")
                if style: PN(" style='"+style+"'")
                PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
                if not args.nobreaks: P("<br />")
                else: P()
        finally:
            # runs on normal completion, on `continue`, and on errors
            P("</div>")

    P(hocr.footer())
finally:
    ostream.close()