ocropus-hocr

#!/usr/bin/env python

import __builtin__ as python
import random as pyrandom
import sys
import os.path
import re
import glob
import argparse
import codecs

import numpy as np
from matplotlib.pyplot import imread

import ocrolib
from ocrolib import hocr

parser = argparse.ArgumentParser("""
Construct an HTML output file in hOCR format by putting together
the recognition results for each page in sequence.
You should usually invoke this program as 

    ocropus-hocr 'book/????.bin.png'

For each page like 'book/0001.bin.png', it uses the following files:

    book/0001.bin.png            # page image
    book/0001.pseg.png           # page segmentation
    book/0001/010001.txt         # recognizer output for lines
""")
parser.add_argument("-b","--nobreaks",action="store_true",help="don't output line breaks")
parser.add_argument("-p","--nopars",action="store_true",help="don't output paragraphs")
parser.add_argument("-s","--fscale",type=float,default=1.0,help="scale factor for translating xheights into font size (use 0 to disable), default: %(default)s")
parser.add_argument("-o","--output",default="book.html",help="output file, default: %(default)s")
parser.add_argument('files',nargs='+')
args = parser.parse_args()
args.files = ocrolib.glob_all(args.files)

ostream = codecs.open(args.output,"w","utf-8")

def E(*args):
    args = [str(x) for x in args]
    sys.stderr.write(" ".join(args))
    sys.stderr.write("\n")
def P(*args):
    ostream.write("".join(args)+"\n")
def PN(*args):
    ostream.write("".join(args))

E("writing to",args.output)
median_xheight = None
dirs = [ocrolib.allsplitext(name)[0] for name in args.files]
xhfiles = python.sum([glob.glob(d+"/??????.xheight") for d in dirs],[])
if len(xhfiles)>5:
    xheights = [float(ocrolib.read_text(f)) for f in xhfiles]
    if len(xheights)>0:
        median_xheight = np.median(xheights)
else:    
    lfiles = python.sum([glob.glob(d+"/??????.bin.png") for d in dirs],[])
    pyrandom.shuffle(lfiles)
    if len(lfiles)>0:
        median_xheight = 0.5*np.median([imread(f).shape[0] for f in lfiles[:100]])
E("median_xheight",median_xheight)

P(hocr.header())

last_coords = None

for arg in args.files:
    base,_ = ocrolib.allsplitext(arg)
    try:
        E("===",arg)
        image = ocrolib.read_image_binary(arg)
        height, width = image.shape
        P("<div class='ocr_page' title='image %s; bbox 0 0 %d %d'>"%(arg,width,height))

        # to proceed, we need a pseg file and a
        # subdirectory containing text lines

        if not os.path.exists(base+".pseg.png"):
            E("%s: no such file"%(base+".pseg.png",))
            continue

        if not os.path.isdir(base):
            E("%s: no such directory"%base)
            continue

        # iterate through the text lines in reading order, based
        # on the page segmentation file

        pseg = ocrolib.read_page_segmentation(base+".pseg.png")
        regions = ocrolib.RegionExtractor()
        regions.setPageLines(pseg)
        for i in range(1,regions.length()):

            # keep track of the bounding box information for each line
            # and insert paragraph breaks as needed

            id = regions.id(i)
            y0,x0,y1,x1 = regions.bbox(i)
            if last_coords is not None:
                lx0,ly0 = last_coords
                dx,dy = x0-lx0,y1-ly0
                par = 0
                if dy>0: 
                    par = 0 # column break... moving upwards
                else:
                    if median_xheight is not None:
                        if abs(dy)>5*median_xheight: par = 1 # whitespace separator
                        if dx>2*median_xheight: par = 1 # indented paragraph
                        if abs(dx)>10*median_xheight: par = 1 # something else
                if par and not args.nopars: P("<p />")
            last_coords = (x0,y0)

            # get the text for the line itself

            lbase = "%s/%06x"%(base,id)

            if not os.path.exists(lbase+".txt"):
                E("note: line %s produced no output (it may not have contained text)"%(lbase+".bin.png"))
                continue

            text = ocrolib.read_text(lbase+".txt")

            text = re.sub(r'\&','\&amp;',text)
            text = re.sub(r'\<','\&lt;',text)

            # accumulate information for each line here

            style = ""
            info = ""

            # estimate the font size for this line

            if median_xheight is not None and os.path.exists(lbase+".xheight"):
                xheight = float(ocrolib.read_text(lbase+".xheight"))
                perc = int(np.clip(xheight*100.0/median_xheight,30,300))
                perc = 10*((perc+5)//10)
                if perc!=100:
                    style += "font-size:%d%%;"%perc

            # output geometric information 

            info += "bbox %d %d %d %d"%(x0,y0,x1,y1)
            if os.path.exists(lbase+".baseline"):
                info += "; baseline "+ocrolib.read_text(lbase+".baseline")

            # put it all together into a SPAN

            PN("<span")
            if style!="": PN(" style='"+style+"'")
            PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
            if not args.nobreaks: P("<br />")
            else: P()

    finally:
        P("</div>")

P(hocr.footer())

ostream.close()