#!/usr/bin/env python
# (works on both Python 2 and Python 3)
"""ohi_latex: Offline HTML Indexer for LaTeX
v1.41 (c) 2014-20,2023-24 Silas S. Brown
License: Apache 2""" # (see below)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# See comments in ohi.py for what this is about.
# This version basically takes the same input and uses
# pdflatex to make a PDF from it instead of HTML fragments.
# Includes a simple HTML to LaTeX converter with support for
# CJK (including Pinyin), Greek, Braille, IPA, Latin diacritics
# and miscellaneous symbols. You could use this alone by
# giving standard input without any 'a name' tags.
from optparse import OptionParser

# Command-line interface.  Each option below becomes a module-level
# global of the same name (dashes turned into underscores by optparse),
# because the code that follows reads them as plain names
# (e.g. outfile, lulu).
opts = OptionParser()
opts.add_option("--infile",
                help="Input file (defaults to standard input)")
opts.add_option("--outfile",default="index.tex",
                help="Output file (use - for standard output, or set a filename for pdflatex to be run on it also)")
opts.add_option("--lulu",action="store_true",default=False,help="Use page settings for Lulu's Letter-size printing service (max 740 pages per volume, tested 2015-05)")
opts.add_option("--createspace",action="store_true",default=False,help="Use page settings for CreateSpace's 7.5x9.25in printing service (max 828 pages per volume, not tested)")
opts.add_option("--a4compact",action="store_true",default=False,help="Use page settings that should work on most laser printers and MIGHT be ok for binding depending on who's doing it")
opts.add_option("--a5",action="store_true",default=False,help="Use page settings intended for 'on-screen only' use on small devices")
opts.add_option("--compromise",action="store_true",default=False,help="Use page settings intended for compromise between A4 and Letter, with a more spacious layout")
opts.add_option("--dry-run",action="store_true",default=False,help="Don't run pdflatex or qpdf")
opts.add_option("--no-open",action="store_true",default=False,help="Don't open the resulting PDF on Mac")
opts.add_option("--version",action="store_true",default=False,help="Show version number and exit")
options, args = opts.parse_args()
if args:
    # Reject stray positional arguments explicitly: an 'assert' here
    # would be stripped by "python -O", silently ignoring them.
    # opts.error() prints the usage line plus this message to stderr
    # and exits with status 2.
    opts.error("Unknown arguments: "+repr(args))
# Publish every parsed option as a global variable of this module
globals().update(options.__dict__)
# "-" selects standard output, represented from here on as None
if outfile=="-": outfile = None
# Page-layout presets.  The first of --lulu, --createspace,
# --a4compact, --a5, --compromise that was given selects a branch
# (with a general-purpose A4 layout as the fallback), and every
# branch sets the same group of globals: geometry, multicol,
# twocol_columns, page_headings, whole_doc_in_footnotesize,
# links_and_bookmarks, remove_adjacent_see, suppress_adjacent_see
# and class_options.  The three bureau-printing presets also rename
# the output file if it was left at its default of index.tex.
if lulu:
    if outfile == "index.tex": outfile = "index-lulu.tex"
    # TODO: reduce headheight ?
    geometry = "paperwidth=8.5in,paperheight=11in,twoside,inner=0.8in,outer=0.5in,tmargin=0.5in,bmargin=0.5in,columnsep=8mm,includehead,headsep=0pt"
    multicol = r"\columnsep=14pt\columnseprule=.4pt"
    twocol_columns = 3
    # Page headings are taken from the anchors (make sure
    # 'includehead' is in geometry if using this)
    page_headings = True
    # For when you're desperate to reduce the page count (magnifier
    # needed!)  The assumption is that fully-sighted people will be
    # OK with this for reading SHORT sections of text (e.g.
    # dictionary lookups), because footnotesize was designed for
    # short footnotes (and at least one commercial dictionary fits
    # 9 lines to the inch, i.e. 8pt; footnotesize is 2pt less than
    # the doc size, i.e. 8pt for the default 10pt if nothing is in
    # class_options below)
    whole_doc_in_footnotesize = True
    # Off, as it seems submitting a PDF with links and bookmarks
    # increases the chance of failure in bureau printing
    links_and_bookmarks = False
    # If you have a lot of alternate headings (with tags ending *)
    # that just say "see" some other heading, you can automatically
    # remove any that turn out to be right next to what they refer
    # to (or to other alternates that refer to the same place), or
    # that are within N entries of such (set to 0 to turn this off,
    # 1 for right next to, 2 for next to but one, etc)
    remove_adjacent_see = 2
    # To save a bit more, suppress 'see' when it occurs after this
    # number of times in succession (0 = unlimited)
    suppress_adjacent_see = 1
    # (maybe set 12pt if the default is not too close to the page
    # limit)
    class_options = ""
elif createspace:
    if outfile == "index.tex": outfile = "index-createspace.tex"
    # inner=0.75in but suggest more if over 600 pages
    geometry = "paperwidth=7.5in,paperheight=9.25in,twoside,inner=0.8in,outer=0.5in,tmargin=0.5in,bmargin=0.5in,columnsep=8mm,includehead,headsep=0pt"
    multicol = r"\columnsep=14pt\columnseprule=.4pt"
    twocol_columns = 2 # or 3 at a push
    page_headings = True
    # (see 'lulu' above for the next 5)
    whole_doc_in_footnotesize = True
    links_and_bookmarks = False
    class_options = ""
    remove_adjacent_see = 2
    suppress_adjacent_see = 1
elif a4compact:
    if outfile == "index.tex": outfile = "index-a4compact.tex"
    geometry = "a4paper,twoside,inner=0.8in,outer=10mm,tmargin=10mm,bmargin=10mm,columnsep=8mm,includehead,headsep=0pt"
    multicol = r"\columnsep=14pt\columnseprule=.4pt"
    twocol_columns = 3
    page_headings = True
    # (see 'lulu' above for the next 5)
    whole_doc_in_footnotesize = True
    links_and_bookmarks = False
    class_options = ""
    remove_adjacent_see = 2
    suppress_adjacent_see = 1
elif a5:
    geometry = "a5paper,lmargin=3mm,rmargin=3mm,tmargin=3mm,bmargin=3mm,columnsep=8mm"
    multicol = "" ; twocol_columns = 2
    page_headings = whole_doc_in_footnotesize = False
    links_and_bookmarks = True
    remove_adjacent_see = suppress_adjacent_see = 0
    class_options = "12pt"
elif compromise:
    geometry = "a4paper,paperheight=11in,lmargin=38mm,rmargin=38mm,tmargin=30mm,bmargin=46mm,columnsep=10mm"
    multicol = ""
    twocol_columns = 2
    page_headings = False
    whole_doc_in_footnotesize = False
    links_and_bookmarks = True
    remove_adjacent_see = 0
    suppress_adjacent_see = 0
    class_options = "12pt"
else:
    # No preset requested.  These settings should work on most laser
    # printers, but binding is an unknown; should be OK for
    # on-screen use.
    geometry = "a4paper,lmargin=10mm,rmargin=10mm,tmargin=10mm,bmargin=15mm,columnsep=8mm"
    multicol = "" ; twocol_columns = 2
    # TODO: page_headings ? (add includehead to the geometry if
    # setting it True)
    page_headings = whole_doc_in_footnotesize = False
    links_and_bookmarks = True
    remove_adjacent_see = suppress_adjacent_see = 0
    class_options = "12pt"
# Indexing and sort-order settings.  For the print version you
# probably don't want to change any of these.

# Set to None for all characters and case-sensitive
alphabet = "abcdefghijklmnopqrstuvwxyz"

# True, or False for parentheses in index headings
ignore_text_in_parentheses = True

more_sensible_punctuation_sort_order = True

# This one is for sorting purposes only
remove_utf8_diacritics = True
# Where to find history:
# on GitHub at https://github.com/ssb22/indexer
# and on GitLab at https://gitlab.com/ssb22/indexer
# and on BitBucket https://bitbucket.org/ssb22/indexer
# and at https://gitlab.developers.cam.ac.uk/ssb22/indexer
# and in China: https://gitee.com/ssb22/indexer
# --------------------------------------------------------
# Python 2/3 compatibility: whichever interpreter runs this, the
# names htmlentitydefs, xrange, unichr, unicode and letters are
# all available afterwards.
try:
    import htmlentitydefs # Python 2
except ImportError: # Python 3
    import html.entities as htmlentitydefs
    # alias the Python 2 names to their Python 3 equivalents
    xrange,unichr,unicode = range,chr,str
import unicodedata,re,sys,os
try:
    from string import letters # Python 2
except ImportError: # Python 3 (only a failed import should trigger the fallback, so no bare 'except')
    from string import ascii_letters as letters
def makeLatex(unistr):
"Convert unistr into a LaTeX document"
# init the lookup stuff INSIDE this function,
# so it's not done unless makeLatex is actually used
sys.stderr.write("makeLatex initialising... ")
simple_html2latex_noregex = {
# we add a non-standard 'twocols' tag:
'
':r'\vskip \medskipamount{}'+'\n', '\n':' ', '
':'', # assumes there'll be another
'':'\n', # works if no newlines in the comment
'':EmOn, '':EmOn, # track it for CJK also
'':r'\bf{}', '':r'\bf{}',
'':EmOff, '':EmOff, '':r'\rm{}', '':r'\rm{}', # assumes well-formed
'':r'\uline{','':'}',
'':r'\sout{','':'}', '':'}',
'':r"\Large{}",
'':r"\offinterlineskip\lineskip2pt\footnotesize{}", # (The 'offinterlineskip' turns off the normal line spacing and makes line spacing effectively irregular depending on the height of each line; can be useful for saving paper if you have lots of additional text in 'small'; not sure if there's a way to turn interline skip back on for the rest of the paragraph etc)
'':r"\normalsize{}",'':r"\normalsize{}",
'':r'\sout{',
'
or whatever '
':r'{\tt ','
':'}',
'':r'$^{\rm ','':'}$',
}
if whole_doc_in_footnotesize: simple_html2latex_noregex.update({"":r"\normalsize{}","":r"\footnotesize{}","