#!/usr/bin/env python
# (works in both Python 2 and Python 3)
# Online HTML Indexer v1.38 (c) 2013-18,2020,2022-24 Silas S. Brown.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# See comments in ohi.py for what this is about.
# Although the offline files will also work ONline, in
# bandwidth-limited situations you might be better using
# this lookup CGI. This version can also take multiple
# adjacent anchors, giving alternate labels to the same
# fragment; there should not be any whitespace between
# adjacent anchors.
# Limited support for the Gemini protocol is also available.
# Configuration
# -------------
# You can change these variables here, but if you do then
# it might be more difficult to upgrade to newer versions
# of this script. However, any file called ohi_config.py
# (in the current directory or 1 level up) will be read.
# Name of the HTML file to index; the derived files are named by
# appending ".index", ".header" and ".footer" to this name.
html_filename = "input.html" # set this to whatever
# - and when that file changes, this script will update
# files with that plus .index, .header and .footer
# (load() decides by comparing the modification time of its
# input file with that of the corresponding .index file)
# (it might be a good idea to do a separate run of this
# script, from the command line, to perform that update,
# especially if you're on a slow machine and the webserver
# has a short timeout; note that the update does NOT have
# to be done on the same machine, as long as the resulting
# files can be copied across)
# (another speedup is to get a small wrapper script to
# import ohi_online; the compiled version can then be used
# after the first time)
alphabet = "abcdefghijklmnopqrstuvwxyz" # set to None for all characters and case-sensitive; any headings not containing ANY of these characters will be put in as-is anyway
# ignore_text_in_parentheses NOT available in the online version because it could make it impossible to fetch entries that differ from others only in parenthetical additions (unless you merge the entries, which might not be a good idea)
# When true, alphaOnly() swaps ',' and ';' and maps spaces and
# hyphens to ';' in the sort key, so that ';' sorts before ','
# and ',' sorts the same as a space.
more_sensible_punctuation_sort_order = True
remove_utf8_diacritics = True # or False, for removing diacritics in index headings (not in main text);
# assumes UTF-8. (Letters with diacritics will be treated as though they did not have any.)
# (implemented by NFD-normalising the heading and dropping
# every character whose Unicode category starts with 'M')
# NOTE(review): the two prompts below are not used in this
# section; presumably the labels shown beside the lookup box
# on the front page and on result pages - confirm.
frontpage_lookup_prompt = "Lookup: "
shorter_lookup_prompt = "Lookup: "
# NOTE(review): presumably the number of index lines shown
# before and after the matched entry (cf. the linesBefore /
# linesAfter parameters of LineMap.linesAround) - confirm
# against the request handler, which is outside this section.
lines_before = 5 ; lines_after = 10
# NOTE(review): presumably the upper limit and step size of a
# "show more lines" feature - usage not visible here, confirm.
max_show_more = 50 ; increment = 10
# Markup placed between the "before" and "after" parts of a result.
# NOTE(review): in this copy the literal was split across two physical
# lines (an unterminated string, i.e. a SyntaxError), which looks like an
# HTML line-break tag that was rendered away; "<br>" is assumed here -
# please confirm against the upstream file.
between_before_and_after = "<br>"
# For more compactness, try this instead:
# between_before_and_after = " | "
# (depends on what sort of data you have though)
# You can override these functions if you want:
def preprocess_result(markup):
    "Override hook (e.g. from ohi_config.py): receives result markup and returns the markup to use; this default returns it unchanged"
    return markup
def links_to_related_services(query):
    "Override hook (e.g. from ohi_config.py): returns extra text for the given query; this default ignores the query and returns an empty string"
    no_links = "" # e.g. "Here | Somewhere else"
    return no_links
code_to_run_when_DOM_changes = ""
# you can set this to any Javascript to run after our JS
# manages to change the DOM (on capable browsers), e.g. to
# fix some typography when browser support is detected
web_adjuster_extension_mode = False
# If set to True, this module's handle() will work - see
# Web Adjuster 'extensions' option for more details.
# If set to False, we just behave as a CGI script.
# (This flag is also consulted when the module is imported,
# to decide whether cgitb browser tracebacks are enabled.)
web_adjuster_extension_url = "http://example.org/ohi.cgi"
web_adjuster_extension_url2 = "http://localhost/ohi.cgi"
# NOTE(review): the two URLs above are not used in this
# section; presumably the addresses that handle() answers for
# in extension mode - confirm against handle().
# Where to find history:
# on GitHub at https://github.com/ssb22/indexer
# and on GitLab at https://gitlab.com/ssb22/indexer
# and on BitBucket https://bitbucket.org/ssb22/indexer
# and at https://gitlab.developers.cam.ac.uk/ssb22/indexer
# and in China: https://gitee.com/ssb22/indexer
# ------------------------------------------
# allow overrides:
import sys, os
# Search the current directory and its parent first, so that a local
# ohi_config.py is found before anything else on the path.
sys.path = ['.','..'] + sys.path
sp = os.environ.get("SCRIPT_PATH","")
if '/' in sp:
    # Also search the script's own directory and that directory's parent,
    # e.g. /var/gemini/cgi-bin and /var/gemini (although symlinking to a home directory is usually better)
    script_dir = sp[:sp.rindex('/')]
    extra_dirs = [script_dir]
    # Only add the parent when there really is one: with a single '/' in
    # SCRIPT_PATH the old rfind() returned -1 and a truncated copy of the
    # script path itself was added to sys.path.
    if '/' in script_dir:
        extra_dirs.append(script_dir[:script_dir.rindex('/')])
    sys.path = extra_dirs + sys.path
try:
    import ohi_config
except ImportError:
    ohi_config = None # no overrides: the defaults in this file apply
gemini_mode = os.environ.get("SERVER_PROTOCOL","")=="GEMINI"
# The settings in ohi_config are star-imported further down, i.e. after
# this check, so look at the config module directly here; otherwise an
# override of web_adjuster_extension_mode would not be seen yet.
if not getattr(ohi_config,"web_adjuster_extension_mode",web_adjuster_extension_mode) and not gemini_mode:
    # NOTE: cgitb (like cgi) was removed from the standard library in
    # Python 3.13, so this import needs a replacement there.
    import cgitb
    cgitb.enable() # remove this if you don't want tracebacks in the browser
import mmap, os, cgi, re
try: from urllib import quote # Python 2
except ImportError: from urllib.parse import quote # Python 3
if ohi_config:
    # An ohi_config.py was found: let it override the defaults above.
    ohi_config.quote = quote # so functions there can use it
    # Star-import so that every public name defined in ohi_config replaces
    # the default of the same name.  Names that this file defines AFTER
    # this point are rebound later and so are not affected by the config.
    from ohi_config import *
# Python 3 has no xrange; alias it to range so the rest of the file can
# use xrange on either version.  Only NameError is expected here, so do
# not use a bare except (which would also swallow KeyboardInterrupt etc).
try:
    xrange
except NameError:
    xrange = range # Python 3
def B(s):
    """Return s as a byte string.

    Text (unicode in Python 2, str in Python 3), including instances of
    subclasses of the text type, is encoded as UTF-8; anything else is
    returned unchanged.
    """
    # isinstance rather than an exact type comparison, so that subclasses
    # of the text type are encoded too instead of slipping through as text
    if isinstance(s, type(u"")):
        return s.encode('utf-8')
    return s
def create_linemap(fName):
    """Open fName read-only and return a LineMap over its whole contents.

    The underlying file object is kept on the returned map as .f.
    Any exception from opening or mapping the file is propagated; if the
    mapping itself fails, the file is closed first instead of being left
    open until garbage collection.
    """
    f = open(fName,"rb")
    try:
        # length 0 = map the whole file
        lm = LineMap(f.fileno(), 0, access=mmap.ACCESS_READ)
    except Exception:
        f.close()
        raise
    lm.f = f # ensure not closed by gc
    return lm
class LineMap(mmap.mmap): # might fail in old Python versions where mmap isn't a class
    """A memory-mapped file with line-oriented helpers.

    bisect() does a binary search over whole lines, so its results are
    only meaningful when the lines of the file are in ascending byte
    order.  All returned lines are byte strings as given by readline()
    (i.e. including the trailing newline where the file has one).
    """
    def linesAround(self,txt,linesBefore,linesAfter):
        """Return (before, line, after) for the line that bisect() finds for txt.

        before is a list of up to linesBefore lines preceding that line
        (fewer if the start of the file is reached), line is the line
        itself, and after is a list of up to linesAfter following lines
        (fewer if the end of the file is reached).  Moves the map's
        current position.
        """
        self.seek(self.bisect(txt))
        # back_line() returns 0 once we are at the start of the file, so
        # this counts how many earlier lines are actually available
        available = 0
        for _ in range(linesBefore):
            available += self.back_line()
        before = [self.readline() for _ in range(available)]
        line = self.readline()
        # readline() returns an empty string at end of file: drop those
        after = [x for x in [self.readline() for _ in range(linesAfter)] if x]
        return before, line, after
    def bisect(self,txt,lo=0,hi=-1):
        """Return the position of the start of the appropriate line for txt.

        txt may be text (which is encoded as UTF-8, the same conversion
        as the module's B helper) or a byte string.  lo and hi delimit the
        byte range still being searched; hi=-1 means the end of the map.
        """
        if isinstance(txt, type(u"")): txt = txt.encode('utf-8')
        if hi==-1: hi=len(self)
        elif hi <= lo:
            # Search range exhausted.  We could simply return
            # self.lineStart(hi), but:
            # amendment: if only the first few characters matched, it's possible that the PREVIOUS entry will match more characters (positioning is rarely helped by an inserted character, e.g. a pinyin shen/sheng confusion, and we probably want to draw more attention to the previous entries in this case, especially if the following entries are completely different e.g. 'shi'; TODO: could even do full 'first entry that matches as many characters as possible' logic)
            ret = self.lineStart(hi)
            if ret==0 or self[ret:ret+len(txt)]==txt: return ret # all characters match current line, or there are no previous lines
            txt2 = txt
            while len(txt2)>1 and not self[ret:ret+len(txt2)]==txt2: txt2 = txt2[:-1] # delete characters from the end until all that are left match current line
            ret2 = self.lineStart(ret-1)
            if self[ret2:ret2+len(txt2)+1]==txt[:len(txt2)+1]: return ret2 # return previous line if they match that as well
            else: return ret
        # Integer midpoint (floor division: no detour through float),
        # then snap back to the start of the line containing it
        lWidth = (hi-lo)//2
        lMid = self.lineStart(lo+lWidth)
        lLine = self.lineAt(lMid)
        if lLine < txt: return self.bisect(txt,lMid+len(lLine),hi) # continue after this line
        else: return self.bisect(txt,lo,lMid) # continue before (or at) this line
    def lineStart(self,pos):
        "Return the position of the start of the line containing pos (a newline at pos itself is not considered)"
        return self.rfind(b"\n",0,pos)+1 # (for start of file, rfind will return -1 so this+1 is still what we want)
    def lineAt(self,pos):
        "Seek to pos and return everything from there to the end of that line"
        self.seek(pos)
        return self.readline()
    def back_line(self):
        """Move the current position back by one line start.

        Returns 0 (without moving) if already at the start of the file,
        otherwise 1.  From the start of a line this moves to the start of
        the previous line; from the middle of a line it moves to the
        start of that same line.
        """
        p = self.tell()
        if not p: return 0
        elif self[p-1:p]==b'\n':
            # at the start of a line: go to the start of the one before
            self.seek(self.lineStart(p-1))
        else: self.seek(self.lineStart(p))
        return 1
# alphaOnly(x) turns a heading (or a query) into its sort/lookup key.
# Which version is defined depends on two settings:
# - alphabet set: the text is lower-cased and only characters that are
#   in alphabet are kept (plus ',' and ';' in the punctuation mode)
# - more_sensible_punctuation_sort_order set: hyphens are first turned
#   into spaces; then ',' and ';' are swapped (using '~COM~' as a
#   temporary placeholder for the comma) and spaces become ';'.  The
#   regular expression then drops any run of ';' that directly follows
#   a ';' or a ','.
# With neither setting, the key is the text itself.
if alphabet and more_sensible_punctuation_sort_order: alphaOnly = lambda x: re.sub('([;,]);+',r'\1',''.join(c for c in x.lower().replace('-',' ').replace(',','~COM~').replace(';',',').replace('~COM~',';').replace(' ',';') if c in alphabet+',;')) # gives ; < , == space (useful if ; is used to separate definitions and , is used before extra words to be added at the start; better set space EQUAL to comma, not higher, or will end up in wrong place if user inputs something forgetting the comma)
elif alphabet: alphaOnly = lambda x: ''.join(c for c in x.lower() if c in alphabet)
elif more_sensible_punctuation_sort_order: alphaOnly = lambda x: re.sub('([;,]);+',r'\1',x.replace('-',' ').replace(',','~COM~').replace(';',',').replace('~COM~',';').replace(' ',';'))
else: alphaOnly = lambda x:x
def ST(x):
    """Return x as the native str type.

    A str (including an instance of a str subclass) is returned as is;
    anything else is decoded as UTF-8 (i.e. bytes on Python 3).
    """
    # isinstance rather than an exact type comparison, so that a str
    # subclass is not sent down the decode path
    if isinstance(x, str): return x # Python 2, or already text on Python 3
    return x.decode('utf-8') # Python 3
if more_sensible_punctuation_sort_order:
    def undo_alphaOnly_swap(x):
        "Convert x to a native string, then turn each ';' into a space and each ',' into ';' (reversing the punctuation swap made by alphaOnly)"
        native = ST(x)
        return native.replace(';',' ').replace(',',';')
else:
    def undo_alphaOnly_swap(x):
        "No punctuation swap is in use, so x is returned unchanged"
        return x
def U(s):
    """Return s as text (unicode in Python 2, str in Python 3).

    Text, including instances of subclasses of the text type, is returned
    as is; anything else is decoded as UTF-8.
    """
    # isinstance rather than an exact type comparison, so that subclasses
    # of the text type are not sent down the decode path
    if isinstance(s, type(u"")): return s
    return s.decode('utf-8')
def S(s):
    """Return s as the native str type, given text.

    On Python 3 (where the native str is already text) s is returned
    unchanged.  On Python 2, text is encoded as UTF-8; a value that is
    already a byte string is returned unchanged rather than being pushed
    through an implicit ASCII decode, which would fail on non-ASCII data.
    """
    if type(u"")==type(""): return s # Python 3
    if isinstance(s, str): return s # Python 2: already a byte string
    return s.encode('utf-8') # Python 2
if remove_utf8_diacritics:
    # Wrap the alphaOnly chosen above so that diacritics are stripped
    # from the text before the sort key is computed.
    _ao = alphaOnly
    import unicodedata
    def alphaOnly(x):
        "Remove combining marks from x (assumed UTF-8 if not already text), then apply the previously-selected alphaOnly"
        # NFD splits each accented letter into its base letter followed
        # by combining marks; the marks are the characters whose Unicode
        # category starts with 'M'
        decomposed = unicodedata.normalize('NFD',U(x))
        stripped = u''.join(c for c in decomposed if not unicodedata.category(c).startswith('M'))
        return _ao(S(stripped))
def load(fName):
txt = create_linemap(fName)
try:
if os.stat(fName).st_mtime <= os.stat(fName+".index").st_mtime:
return txt,create_linemap(fName+".index"),open(fName+".header").read(),open(fName+".footer").read()
except OSError: pass
ret = {}
contentStart = 0 ; header="" ; tag = ""
altTags = []
for m in re.finditer(B(r''),txt):
# First, output the content from the PREVIOUS tag:
if contentStart and contentStart==m.start():
# oops, previous tag has NO content, so treat it as an 'alternate heading' to the tag we're about to have:
altTags.append(tag)
else:
for ttag in [tag]+altTags:
tag2 = alphaOnly(ttag)
if not tag2: tag2 = ttag
if contentStart:
if not tag2 in ret: ret[tag2] = (ttag,[])
ret[tag2][1].append("\t"+str(contentStart)+"\t"+str(m.start()))
else: # we're on the first tag
assert not altTags
header=txt[:m.start()]
if type(u"")==type(""): header=header.decode('utf-8') # Python 3
altTags = []
# Now look at the new tag:
tag = m.group(1) ; contentStart = m.end()
if type(u"")==type(""): tag=tag.decode('utf-8') # Python 3
footer = txt[contentStart:]
if type(u"")==type(""): footer=footer.decode('utf-8') # Python 3
if not header.strip(): header='
'+link(line,q)+' |