"""
Post process markdown HTML file.
Ideally integrate as an add-on to:
http://www.freewisdom.org/projects/python-markdown/
But this works on output from the original tool so is useful as prototype.
Features:
* Add a title to the html doc based on the first H1 (title).
Very simplistic: the H1 must be on a single line, as output by markdown.
* Add a TOC - based on headers found, this is very simplistic, see:
* http://www.oac.uci.edu/indiv/ehood/htmltoc.doc.html
* http://www.katspace.org/tools/hypertoc/
My regex TOC finder (only handles single line headers as
generated by markdown, freaks on multi line and bad header defs):
<h(?P<level>[123456])>(?P<title>.*?)</h(?P=level)>
SET IgnoreCase - haven't got multi line going yet
Currently ignores H1 headers in the TOC - this is selective - but does
create anchor tags for them.
* ? call initial markdown conversion? / Add as a NanoDom PostProcessor to python markdown.
* ? Integrate with tidy? Either use uTidyLib etc. or just call the tidy exe directly (see cherrypy tidy filter)
* ? look at adding an internal/external CSS style sheet depending on flags? Tidy does this
"""
import sys
import getopt
def h1_to_title(infile=None, outfile=None):
    """Prepend a ``<title>`` element built from the first ``<h1>`` line.

    Reads all of *infile* and looks for the first line that starts with
    ``<h1>``; the text between that tag and the closing ``</h1>`` becomes
    the title. The title element is written to *outfile*, followed by the
    input lines unchanged. When no line starts with ``<h1>`` the title
    text is empty.

    The heading must sit on a single line, which is how markdown emits it.

    Args:
        infile: Readable text file object holding the HTML to process.
        outfile: Writable text file object that receives the result.

    NOTE(review): the tag literals used here (``<h1>``, ``</h1>``,
    ``<title>``) had been stripped from the source and were reconstructed
    from the module docstring -- confirm the exact title markup wanted.
    """
    open_tag = '<h1>'
    close_tag = '</h1>'

    inlines = infile.readlines()
    title = ''
    for line in inlines:
        if not line.startswith(open_tag):
            continue
        remainder = line[len(open_tag):]
        end = remainder.find(close_tag)
        # Without a closing tag, keep the rest of the line rather than
        # slicing with -1, which would silently drop the last character.
        title = remainder[:end] if end != -1 else remainder.rstrip()
        # Only the first H1 names the document.
        break

    outfile.write('<title> %s </title>' % title)
    outfile.write("".join(inlines))
def _build_toc_html(headers, toc_heading_type=2, include_header_one_in_toc=False):
    """Render the table-of-contents markup for the collected *headers*.

    Args:
        headers: Sequence of ``(level, title, anchor_name)`` tuples where
            *level* is the heading digit as a string (``'1'``..``'6'``).
        toc_heading_type: Heading level used for the "Table Of Contents"
            caption.
        include_header_one_in_toc: When false, H1 entries are left out.

    Returns:
        The TOC as a single HTML string.
    """
    parts = [
        "<h%d>Table Of Contents</h%d>" % (toc_heading_type, toc_heading_type),
        "<ul>\n",
    ]
    for level, title, anchor_name in headers:
        if level == '1' and not include_header_one_in_toc:
            continue
        # H1/H2 sit at the top level; each deeper level is indented by
        # wrapping the entry in one extra <ul>. (Simplistic: this indents
        # visually but is not a properly nested list.)
        extra_depth = max(int(level) - 2, 0)
        parts.append('<ul>' * extra_depth)
        parts.append('<li><a href="#%s">%s</a></li>\n' % (anchor_name, title))
        parts.append('</ul>' * extra_depth)
    parts.append("</ul>\n")
    return ''.join(parts)


def html_add_toc(infile=None, outfile=None, TOC_INCLUDE_MARKER=None):
    """Add anchors before headings and expand the TOC marker, if present.

    Every line of *infile* containing a single-line ``<hN>...</hN>``
    heading with non-blank text gets an ``<a name="MD_autoTOC_<n>">``
    anchor inserted in front of it. If any line contains
    *TOC_INCLUDE_MARKER*, each occurrence of the marker is replaced with
    a table of contents linking to those anchors. H1 headings receive
    anchors but are not listed in the TOC. The result is written to
    *outfile*.

    Args:
        infile: Readable text file object holding the HTML to process.
        outfile: Writable text file object that receives the result.
        TOC_INCLUDE_MARKER: Text to replace with the TOC. An empty marker
            disables TOC insertion (anchors are still added).

    NOTE(review): the default marker, the header regex and the anchor /
    list markup had been stripped from the source. They were reconstructed
    here; ``<!--TOC-->`` in particular is a placeholder -- confirm the
    marker that existing documents actually use.
    """
    import re

    if TOC_INCLUDE_MARKER is None:
        TOC_INCLUDE_MARKER = '<!--TOC-->'

    # One heading per line, opening and closing level must agree.
    header_re = re.compile(
        r"<h(?P<level>[123456])>(?P<title>.*?)</h(?P=level)>", re.IGNORECASE)

    add_toc_to_html = False
    outlines = []
    headers_list = []
    for line in infile.readlines():
        # Guard against an empty marker: '' is "found" in every string and
        # str.replace('') would insert the TOC between every character.
        if TOC_INCLUDE_MARKER and TOC_INCLUDE_MARKER in line:
            add_toc_to_html = True

        match = header_re.search(line)
        if match is not None and match.group('title').strip():
            html_anchor_name = 'MD_autoTOC_%d' % len(headers_list)
            headers_list.append(
                (match.group('level'), match.group('title'), html_anchor_name))
            outlines.append('<a name="%s"></a>' % html_anchor_name)
        outlines.append(line)

    html_string = "".join(outlines)
    if add_toc_to_html:
        html_string = html_string.replace(
            TOC_INCLUDE_MARKER, _build_toc_html(headers_list))
    outfile.write(html_string)
def html_add_toc_old_and_busted(infile=None, outfile=None, TOC_INCLUDE_MARKER=None):
    """Legacy TOC generator: nested list of heading titles, without links.

    Reads all of *infile*; if it contains *TOC_INCLUDE_MARKER*, every
    occurrence of the marker is replaced by a "Table Of Contents" caption
    followed by a nested ``<ul>`` of the single-line ``<hN>...</hN>``
    headings found in the document (H1 headings are skipped). No anchors
    are created and the entries carry no ``href``. The result is written
    to *outfile*.

    Args:
        infile: Readable text file object holding the HTML to process.
        outfile: Writable text file object that receives the result.
        TOC_INCLUDE_MARKER: Text to replace with the TOC. An empty marker
            disables TOC insertion.

    NOTE(review): the marker, regex and list markup had been stripped
    from the source, so the exact legacy output could not be recovered.
    This version emits a well-formed nested list instead, and
    ``<!--TOC-->`` is a placeholder default -- confirm before relying on it.
    """
    import re

    if TOC_INCLUDE_MARKER is None:
        TOC_INCLUDE_MARKER = '<!--TOC-->'

    html_string = infile.read()

    # An empty marker would match everywhere and make str.replace insert
    # the TOC between every character, so treat it as "no marker".
    if TOC_INCLUDE_MARKER and TOC_INCLUDE_MARKER in html_string:
        header_re = re.compile(
            r"<h(?P<level>[123456])>(?P<title>.*?)</h(?P=level)>",
            re.IGNORECASE)
        toc_heading_type = 2
        toc_html = ["<h%d>Table Of Contents</h%d>\n"
                    % (toc_heading_type, toc_heading_type)]

        # Heading levels of the currently open <ul> elements, outermost
        # first. Levels are compared as ints (the regex yields strings).
        open_levels = []
        for level_text, heading_title in header_re.findall(html_string):
            level = int(level_text)
            if level == 1:
                continue
            if not open_levels or level > open_levels[-1]:
                # Deeper than the current list: nest inside the open <li>.
                toc_html.append("<ul>\n")
                open_levels.append(level)
            else:
                toc_html.append("</li>\n")
                # Close nested lists until the enclosing list is shallower
                # than this heading.
                while len(open_levels) > 1 and open_levels[-2] >= level:
                    open_levels.pop()
                    toc_html.append("</ul>\n</li>\n")
                open_levels[-1] = level
            toc_html.append("<li>%s" % heading_title)
        for _ in open_levels:
            toc_html.append("</li>\n</ul>\n")

        html_string = html_string.replace(TOC_INCLUDE_MARKER, "".join(toc_html))

    outfile.write(html_string)
class Usage(Exception):
    """Command-line usage error; ``msg`` holds the text shown to the user."""

    def __init__(self, msg):
        # Initialise the base class too so str(err) and err.args carry the
        # message instead of being empty.
        Exception.__init__(self, msg)
        self.msg = msg
def main(argv=None):
    """Command-line entry point.

    Usage: ``prog [--help] [-t toc|title] [-o OUTFILE] [INFILE]``

    * ``-t title`` (default) runs ``h1_to_title``; ``-t toc`` runs
      ``html_add_toc``. Any other ``-t`` value is ignored.
    * ``-o OUTFILE`` writes to that file instead of standard output.
    * ``INFILE`` is read instead of standard input when given.
    * ``--help`` prints the module docstring.

    Args:
        argv: Full argument vector including the program name; defaults
            to ``sys.argv``.

    Returns:
        0 on success (or after ``--help``), 2 on a usage error.
    """
    if argv is None:
        argv = sys.argv

    try:
        try:
            # "o:" -- the output option takes a file name argument.
            opts, args = getopt.getopt(argv[1:], "t:o:", ["help"])
        except getopt.error as msg:
            raise Usage(msg)
    except Usage as err:
        sys.stderr.write("%s\n" % (err.msg,))
        sys.stderr.write("for help use --help\n")
        return 2

    mode = 'title'
    outfile_name = None
    for flag, flag_param in opts:
        if flag == '--help':
            sys.stdout.write(__doc__ or '')
            return 0
        if flag == '-t':
            if flag_param in ('toc', 'title'):
                mode = flag_param
        elif flag == '-o':
            outfile_name = flag_param

    infile = sys.stdin
    outfile = sys.stdout
    try:
        if args:
            infile = open(args[0], "r")
        if outfile_name is not None:
            outfile = open(outfile_name, "w")

        function_to_perform = html_add_toc if mode == 'toc' else h1_to_title
        function_to_perform(infile=infile, outfile=outfile)
    finally:
        # Close only what was opened here; leave the std streams alone.
        if infile is not sys.stdin:
            infile.close()
        if outfile is not sys.stdout:
            outfile.close()
    return 0
# Run as a script: hand the process arguments to main() and use its return
# value as the exit status.
if __name__ == "__main__":
    sys.exit(main(argv=sys.argv))