""" Post process markdown HTML file. Ideally integrate as an add-on to: http://www.freewisdom.org/projects/python-markdown/ But this works on output from the original tool so is useful as prototype. Features: * Added a title to the html doc based on the first H1 (title) Very simplistic H1 must be ona single line as output by markdown * Add a TOC - based on headers found, this is very simplistic, see: * http://www.oac.uci.edu/indiv/ehood/htmltoc.doc.html * http://www.katspace.org/tools/hypertoc/ My regex TOC finder (only handles single line headers as generated by markdown, freaks on multi line and bad header defs: [123456])>(?P.*) SET IgnoreCase - haven't got multi line going yet Currently ignores H1 headers in the TOC - this is selective - but does create anchor tags for them. * ? call initial markdown conversion? / Add as a NanoDom PostProcessor to python markdown. * ? Integrate with tidy? Either use uTinyLib etc. or just call tidy exe directly (see cherrypy tidy filter) * ? look at adding an internal/external CSS style sheet depending on flags? Tidy does this """ import sys import getopt def h1_to_title(infile=None, outfile=None): #inlines = infile.read() inlines = infile.readlines() #infile.close() title='' # simple lame non-regex approach for line in inlines: if line.startswith('

'): title=line[len('

'):] title=title[:title.find('

')] """ print '%s' % title for line in inlines: print line, """ outfile.write(' %s ' % title) """ for line in inlines: outfile.write(line) """ html_string = "".join(inlines) outfile.write(html_string) def html_add_toc(infile=None, outfile=None,TOC_INCLUDE_MARKER=None): if TOC_INCLUDE_MARKER is None: TOC_INCLUDE_MARKER='' inlines = infile.readlines() import re rawstr = r"""[123456])>(?P.*)""" compile_obj = re.compile(rawstr, re.IGNORECASE| re.MULTILINE) add_toc_to_html=False outlines=[] headers_list=[] generated_anchor_id=0 for line in inlines: # Simple line at a time, head def HAS to be on a single line or this fails if line.find(TOC_INCLUDE_MARKER) != -1: add_toc_to_html=True """ # 2 pieces of code to find groups match_obj = compile_obj.search(line) headers_info = match_obj.groups() """ # or single line: # list of tuples headers_info = compile_obj.findall(line) if headers_info != [] : headers_info = headers_info[0] if headers_info[1].strip() !="": html_anchor_name='MD_autoTOC_%d' % (generated_anchor_id) html_anchor='' % (html_anchor_name) headers_list.append( (headers_info[0], headers_info[1], html_anchor_name) ) outlines.append(html_anchor) generated_anchor_id = generated_anchor_id + 1 outlines.append(line) inlines = outlines html_string = "".join(inlines) if add_toc_to_html: include_header_one_in_toc = True include_header_one_in_toc = False last_heading_type=0 toc_heading_type=2 toc_html=[] toc_html.append("Table Of Contents" %(toc_heading_type, toc_heading_type)) ''' # nested TOC that isn't quite working yet :-( for (heading_type, heading_title, html_anchor_name) in headers_list: if include_header_one_in_toc or heading_type !='1' : if heading_type > last_heading_type: toc_html.append("
    ") elif heading_type == last_heading_type: toc_html.append("") else: toc_html.append("
") #print heading_type, "
  • %d %s" % (heading_type, heading_title) #print "
  • %s" % (heading_title) toc_html.append( '
  • %s' % (html_anchor_name, heading_title) ) last_heading_type=heading_type toc_html.append("") toc_html.append("
    ") toc_html = ''.join(toc_html) ''' toc_html.append("
      \n") for (heading_type, heading_title, html_anchor_name) in headers_list: if include_header_one_in_toc or heading_type !='1' : indent_list=range(1, int(heading_type)-1) for x in indent_list: toc_html.append('
        ') toc_html.append( '
      • %s
      • \n' % (html_anchor_name, heading_title) ) for x in indent_list: toc_html.append('
      ') last_heading_type=heading_type toc_html.append("
    \n") toc_html.append("
    ") toc_html = ''.join(toc_html) html_string = html_string.replace(TOC_INCLUDE_MARKER, toc_html) outfile.write(html_string) def html_add_toc_old_and_busted(infile=None, outfile=None): inlines = infile.readlines() html_string = "".join(inlines) ''' import re rawstr = r"""[123456])>(?P.*)""" embedded_rawstr = r"""(?i)[123456])>(?P.*)""" # method 1: using a compile object compile_obj = re.compile(rawstr, re.IGNORECASE| re.MULTILINE) match_obj = compile_obj.search(html_string) # method 2: using search function (w/ external flags) #match_obj = re.search(rawstr, html_string, re.IGNORECASE| re.MULTILINE) # method 3: using search function (w/ embedded flags) #match_obj = re.search(embedded_rawstr, html_string) # Retrieve group(s) from match_obj #all_groups = match_obj.groups() all_groups = None # Retrieve group(s) by index group_1 = match_obj.group(1) group_2 = match_obj.group(2) print "group_1", group_1 print "group_2", group_2 print "all_groups ", all_groups print "findall", compile_obj.findall(html_string) ''' TOC_INCLUDE_MARKER='' TOC_INCLUDE_MARKER='' if html_string.find(TOC_INCLUDE_MARKER) != -1: # code to dump out TOC # at this time, missing href links! import re rawstr = r"""[123456])>(?P.*)""" embedded_rawstr = r"""(?i)[123456])>(?P.*)""" compile_obj = re.compile(rawstr, re.IGNORECASE| re.MULTILINE) headers_found = compile_obj.findall(html_string) include_header_one_in_toc = True include_header_one_in_toc = False last_heading_type=0 toc_heading_type=2 toc_html=[] toc_html.append("Table Of Contents" %(toc_heading_type, toc_heading_type)) toc_html.append("
      ") for (heading_type, heading_title) in headers_found: if include_header_one_in_toc or heading_type !='1' : if heading_type > last_heading_type: toc_html.append("
        ") elif heading_type == last_heading_type: toc_html.append("") else: toc_html.append("
      ") #print heading_type, "
    • %d %s" % (heading_type, heading_title) #print "
    • %s" % (heading_title) toc_html.append("
    • %s" % (heading_title)) last_heading_type=heading_type toc_html.append("
    ") toc_html.append("
    ") toc_html = ''.join(toc_html) html_string = html_string.replace(TOC_INCLUDE_MARKER, toc_html) outfile.write(html_string) class Usage(Exception): def __init__(self, msg): self.msg = msg def main(argv=None): try: try: opts, args = getopt.getopt(argv[1:], "t:o", ["help"]) except getopt.error, msg: raise Usage(msg) # more code, unchanged except Usage, err: print >>sys.stderr, err.msg print >>sys.stderr, "for help use --help" return 2 """ print "opts = " + str(opts) print "args = " + str(args) return 1 """ infile = sys.stdin outfile = sys.stdout if args != []: infile_name = args[0] infile = open(infile_name , "r") else: infile = sys.stdin function_to_perform = h1_to_title if opts != []: for (flag, flag_param) in opts: if flag == '-t': if flag_param == 'toc': function_to_perform = html_add_toc if flag_param == 'title': function_to_perform = h1_to_title if flag == '-o': outfile = open(flag_param, "w") """ outfile = sys.stdout infile_name = '-' if args != []: infile_name = args[0] inlines = [] import fileinput for line in fileinput.input(infile_name): inlines.append(line) """ #h1_to_title(infile=infile, outfile=outfile) #html_add_toc(infile=infile, outfile=outfile) function_to_perform(infile=infile, outfile=outfile) if __name__ == "__main__": sys.exit(main(argv=sys.argv))