1 # I took python-markdown2 and modified a few syntax elements to behave more
2 # like dokuwiki. The original python-markdown2 can be found at:
4 # https://github.com/trentm/python-markdown2
8 # Copyright (c) 2007-2008 ActiveState Corp.
9 # License: MIT (http://www.opensource.org/licenses/mit-license.php)
11 r"""A fast and complete Python implementation of Markdown.
13 [from http://daringfireball.net/projects/markdown/]
14 > Markdown is a text-to-HTML filter; it translates an easy-to-read /
15 > easy-to-write structured text format into HTML. Markdown's text
16 > format is most similar to that of plain text email, and supports
17 > features such as headers, *emphasis*, code blocks, blockquotes, and
20 > Markdown's syntax is designed not as a generic markup language, but
21 > specifically to serve as a front-end to (X)HTML. You can use span-level
22 > HTML tags anywhere in a Markdown document, and you can use block level
23 > HTML tags (like <div> and <table> as well).
28 >>> markdown2.markdown("*boo!*") # or use `html = markdown_path(PATH)`
29 u'<p><em>boo!</em></p>\n'
31 >>> markdowner = Markdown()
32 >>> markdowner.convert("*boo!*")
33 u'<p><em>boo!</em></p>\n'
34 >>> markdowner.convert("**boom!**")
35 u'<p><strong>boom!</strong></p>\n'
37 This implementation of Markdown implements the full "core" syntax plus a
38 number of extras (e.g., code syntax coloring, footnotes) as described on
39 <https://github.com/trentm/python-markdown2/wiki/Extras>.
42 cmdln_desc = """A fast and complete Python implementation of Markdown, a
43 text-to-HTML conversion tool for web writers.
45 Supported extras (see -x|--extras option below):
46 * code-friendly: Disable _ and __ for em and strong.
47 * code-color: Pygments-based syntax coloring of <code> sections.
48 * cuddled-lists: Allow lists to be cuddled to the preceding paragraph.
49 * footnotes: Support footnotes as in use on daringfireball.net and
50 implemented in other Markdown processors (tho not in Markdown.pl v1.0.1).
51 * header-ids: Adds "id" attributes to headers. The id value is a slug of
53 * html-classes: Takes a dict mapping html tag names (lowercase) to a
54 string to use for a "class" tag attribute. Currently only supports
55 "pre" and "code" tags. Add an issue if you require this for other tags.
56 * markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to
57 have markdown processing be done on its contents. Similar to
58 <http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
60 * pyshell: Treats unindented Python interactive shell sessions as <code>
62 * link-patterns: Auto-link given regex patterns in text (e.g. bug number
63 references, revision number references).
64 * smarty-pants: Replaces ' and " with curly quotation marks or curly
65 apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes,
67 * toc: The returned HTML string gets a new "toc_html" attribute which is
68 a Table of Contents for the document. (experimental)
69 * xml: Passes one-liner processing instructions and namespaced XML tags.
70 * wiki-tables: Google Code Wiki-style tables. See
71 <http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
75 # - There is already a Python markdown processor
76 # (http://www.freewisdom.org/projects/python-markdown/).
77 # - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm
# not yet sure if there are implications with this. Compare 'pydoc sre'
79 # and 'perldoc perlre'.
81 __version_info__ = (1, 0, 1, 19) # first three nums match Markdown.pl
82 __version__ = '1.0.1.19'
83 __author__ = "Trent Mick"
87 from pprint import pprint
91 from hashlib import md5
95 from random import random, randint
97 from urllib.parse import quote
#---- Python version compat

# NOTE(review): this shim appears truncated in this excerpt -- the `reversed`
# generator is missing its `yield i` body, and the second `_unicode_decode`
# definition looks like it belongs under a missing `else:` (Python >= 2.4
# path) rather than inside this branch.
if sys.version_info[:2] < (2,4):
    # Backport: `set` and `reversed` were added in Python 2.4.
    from sets import Set as set
    def reversed(sequence):
        for i in sequence[::-1]:
    # Decode bytes, replacing undecodable chars with XML char references.
    def _unicode_decode(s, encoding, errors='xmlcharrefreplace'):
        return unicode(s, encoding, errors)
    # Presumably the modern-Python path (strict decoding) -- TODO confirm
    # it was originally under an `else:`.
    def _unicode_decode(s, encoding, errors='strict'):
        return s.decode(encoding, errors)
# Module-level logger.
log = logging.getLogger("markdown")

DEFAULT_TAB_WIDTH = 4

# Salt mixed into placeholder hashes so they differ per process/run.
# NOTE(review): two competing assignments survive here -- originally these
# likely sat on alternative branches; as written the uuid4-based value
# simply overwrites the randint-based one.
SECRET_SALT = str(randint(0, 1000000)).encode('utf-8')
SECRET_SALT = str(uuid.uuid4()).encode('utf-8')
# NOTE(review): the two `return` lines below look like the truncated bodies
# of the module's text-hashing helpers (`_hash_ascii` / `_hash_text`) whose
# `def` lines are elided from this excerpt.
#return md5(s).hexdigest() # Markdown.pl effectively does this.
return 'md5-' + md5(SECRET_SALT + s.encode("utf-8")).hexdigest()
return 'md5-' + md5(SECRET_SALT + s.encode("utf-8")).hexdigest()

# Table of hash values for escaped characters:
g_escape_table = dict([(ch, _hash_ascii(ch))
                       for ch in '\\`*_{}[]()>#+-.!'])
class MarkdownError(Exception):
    """Error raised by this module (e.g. for an invalid `safe_mode` value).

    BUG FIX: the class previously had no body, which is a syntax error.
    """
    pass
def markdown_path(path, encoding="utf-8",
                  html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
                  safe_mode=None, extras=None, link_patterns=None,
                  use_file_vars=False):
    """Convert the markdown file at `path` to HTML and return the result.

    The file is read with the given `encoding`; all other options are
    passed straight through to `Markdown`.
    """
    # BUG FIX: the file was opened but never read or closed, and `text`
    # was referenced while undefined.
    fp = codecs.open(path, 'r', encoding)
    try:
        text = fp.read()
    finally:
        fp.close()
    return Markdown(html4tags=html4tags, tab_width=tab_width,
                    safe_mode=safe_mode, extras=extras,
                    link_patterns=link_patterns,
                    use_file_vars=use_file_vars).convert(text)
def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
             safe_mode=None, extras=None, link_patterns=None,
             use_file_vars=False):
    """Convert markdown `text` to HTML with a one-shot `Markdown` instance."""
    converter = Markdown(html4tags=html4tags,
                         tab_width=tab_width,
                         safe_mode=safe_mode,
                         extras=extras,
                         link_patterns=link_patterns,
                         use_file_vars=use_file_vars)
    return converter.convert(text)
class Markdown(object):
    """The Markdown-to-HTML converter: construct one (optionally with
    options/extras) and call `convert(text)`."""
    # The dict of "extras" to enable in processing -- a mapping of
    # extra name to argument for the extra. Most extras do not have an
    # argument, in which case the value is None.
    #
    # This can be set via (a) subclassing and (b) the constructor
    # "extras" argument.

    # Placeholder emitted for removed HTML in safe_mode == "replace".
    html_removed_text = "[HTML_REMOVED]" # for compat with markdown.py

    # Used to track when we're inside an ordered or unordered list
    # (see _ProcessListItems() for details):
    # NOTE(review): the attribute this comment described (a list-nesting
    # counter) is not present in this excerpt; the regex below matches
    # lines consisting only of spaces/tabs.
    _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)
    def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
                 extras=None, link_patterns=None, use_file_vars=False):
        # Choose how empty elements are closed: ">" for HTML4, " />" for
        # XHTML.
        # NOTE(review): the `if html4tags: ... else: ...` lines appear to be
        # elided from this excerpt; as written the second assignment wins.
        self.empty_element_suffix = ">"
        self.empty_element_suffix = " />"
        self.tab_width = tab_width

        # For compatibility with earlier markdown2.py and with
        # markdown.py's safe_mode being a boolean,
        #   safe_mode == True -> "replace"
        if safe_mode is True:
            self.safe_mode = "replace"
        # NOTE(review): `else:` elided -- originally the raw value was kept.
        self.safe_mode = safe_mode

        # Massaging and building the "extras" info.
        if self.extras is None:
            # NOTE(review): body elided (presumably `self.extras = {}`).
        elif not isinstance(self.extras, dict):
            self.extras = dict([(e, None) for e in self.extras])
        # NOTE(review): an enclosing `if extras:` guard appears elided here.
        if not isinstance(extras, dict):
            extras = dict([(e, None) for e in extras])
        self.extras.update(extras)
        assert isinstance(self.extras, dict)
        if "toc" in self.extras and not "header-ids" in self.extras:
            self.extras["header-ids"] = None # "toc" implies "header-ids"
        self._instance_extras = self.extras.copy()

        self.link_patterns = link_patterns
        self.use_file_vars = use_file_vars
        # Matches one level of leading indentation (a tab or up to
        # `tab_width` spaces) for outdenting.
        self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)

        self._escape_table = g_escape_table.copy()
        if "smarty-pants" in self.extras:
            # Quotes must round-trip untouched for smarty-pants to work.
            self._escape_table['"'] = _hash_ascii('"')
            self._escape_table["'"] = _hash_ascii("'")

    # NOTE(review): a `def reset(self):` line appears to be elided here;
    # the statements below reinitialize per-conversion state.
        self.html_blocks = {}
        self.extras = self._instance_extras.copy()
        if "footnotes" in self.extras:
            # NOTE(review): elided line (presumably `self.footnotes = {}`).
            self.footnote_ids = []
        if "header-ids" in self.extras:
            self._count_from_header_id = {} # no `defaultdict` in Python 2.4
    def convert(self, text):
        """Convert the given text."""
        # Main function. The order in which other subs are called here is
        # essential. Link and image substitutions need to happen before
        # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
        # and <img> tags get encoded.

        # Clear the global hashes. If we don't clear these, you get conflicts
        # from other articles when generating a page which contains more than
        # one article (e.g. an index page that shows the N most recent
        # articles):
        # NOTE(review): the reset call these comments describe is elided
        # from this excerpt, as are several other lines marked below.

        if not isinstance(text, str):
            #TODO: perhaps shouldn't presume UTF-8 for string input?
            text = text.decode('utf-8')

        if self.use_file_vars:
            # Look for emacs-style file variable hints.
            emacs_vars = self._get_emacs_vars(text)
            if "markdown-extras" in emacs_vars:
                splitter = re.compile("[ ,]+")
                for e in splitter.split(emacs_vars["markdown-extras"]):
                    # NOTE(review): the `if '=' in e:` / `else:` guards around
                    # the next two (mutually exclusive) lines appear elided.
                    ename, earg = e.split('=', 1)
                    ename, earg = e, None
                    self.extras[ename] = earg

        # Standardize line endings:
        text = re.sub("\r\n|\r", "\n", text)

        # Make sure $text ends with a couple of newlines:
        # NOTE(review): the `text += "\n\n"` this refers to is elided.

        # Convert all tabs to spaces.
        text = self._detab(text)

        # Strip any lines consisting only of spaces and tabs.
        # This makes subsequent regexen easier to write, because we can
        # match consecutive blank lines with /\n+/ instead of something
        # contorted like /[ \t]*\n+/ .
        text = self._ws_only_line_re.sub("", text)

        # NOTE(review): presumably guarded by `if self.safe_mode:` -- confirm.
        text = self._hash_html_spans(text)

        # Turn block-level HTML blocks into hash entries
        text = self._hash_html_blocks(text, raw=True)

        # Strip link definitions, store in hashes.
        if "footnotes" in self.extras:
            # Must do footnotes first because an unlucky footnote defn
            # looks like a link defn:
            #   [^4]: this "looks like a link defn"
            text = self._strip_footnote_definitions(text)
        text = self._strip_link_definitions(text)

        text = self._run_block_gamut(text)

        if "footnotes" in self.extras:
            text = self._add_footnotes(text)

        # Subclass hook (see `postprocess`).
        text = self.postprocess(text)

        text = self._unescape_special_chars(text)

        # NOTE(review): presumably guarded by `if self.safe_mode:` -- confirm.
        text = self._unhash_html_spans(text)

        # Wrap the result so extras (e.g. "toc") can attach attributes.
        rv = UnicodeWithAttrs(text)
        if "toc" in self.extras:
            # NOTE(review): body elided (presumably attaches `rv.toc_html`);
            # the final `return rv` is also absent from this excerpt.
324 def postprocess(self, text):
325 """A hook for subclasses to do some postprocessing of the html, if
326 desired. This is called before unescaping of special chars and
327 unhashing of raw HTML spans.
    # Matches a '-*- var: val; ... -*-' one-liner; group 1 is the inner text.
    _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
    # This regular expression is intended to match blocks like this:
    #   PREFIX Local Variables: SUFFIX
    #   PREFIX mode: Tcl SUFFIX
    #   PREFIX End: SUFFIX
    # Some notes:
    # - "[ \t]" is used instead of "\s" to specifically exclude newlines
    # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
    #   not like anything other than Unix-style line terminators.
    _emacs_local_vars_pat = re.compile(r"""^
        (?P<prefix>(?:[^\r\n|\n|\r])*?)
        [\ \t]*Local\ Variables:[\ \t]*
        (?P<suffix>.*?)(?:\r\n|\n|\r)
        (?P<content>.*?\1End:)
        """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
    def _get_emacs_vars(self, text):
        """Return a dictionary of emacs-style local variables.

        Parsing is done loosely according to this spec (and according to
        some in-practice deviations from this):
        http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
        """
        # NOTE(review): this excerpt elides several lines (the `emacs_vars`
        # dict initializer, the `head`/`tail` slicing, `if match:` guards,
        # try/except wrappers, `continue`s and the final return); the
        # comments below annotate the visible logic only.
        SIZE = pow(2, 13) # 8kB

        # Search near the start for a '-*-'-style one-liner of variables.
        match = self._emacs_oneliner_vars_pat.search(head)
        emacs_vars_str = match.group(1)
        assert '\n' not in emacs_vars_str
        # Split the one-liner into individual "var: value" strings.
        emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
        if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
            # While not in the spec, this form is allowed by emacs:
            #   -*- Tcl -*-
            # where the implied "variable" is "mode". This form
            # is only allowed if there are no other variables.
            emacs_vars["mode"] = emacs_var_strs[0].strip()
        for emacs_var_str in emacs_var_strs:
            # NOTE(review): originally a try/except around the split, with
            # the log.debug in the failure branch.
            variable, value = emacs_var_str.strip().split(':', 1)
            log.debug("emacs variables error: malformed -*- "
                      "line: %r", emacs_var_str)
            # Lowercase the variable name because Emacs allows "Mode"
            # or "mode" or "MoDe", etc.
            emacs_vars[variable.lower()] = value.strip()

        # Search in the tail for a "Local Variables" block.
        if "Local Variables" in tail:
            match = self._emacs_local_vars_pat.search(tail)
            prefix = match.group("prefix")
            suffix = match.group("suffix")
            lines = match.group("content").splitlines(0)
            #print "prefix=%r, suffix=%r, content=%r, lines: %s"\
            #      % (prefix, suffix, match.group("content"), lines)

            # Validate the Local Variables block: proper prefix and suffix
            # usage on every line.
            for i, line in enumerate(lines):
                if not line.startswith(prefix):
                    log.debug("emacs variables error: line '%s' "
                              "does not use proper prefix '%s'"
                # Don't validate suffix on last line. Emacs doesn't care,
                # so neither do we.
                if i != len(lines)-1 and not line.endswith(suffix):
                    log.debug("emacs variables error: line '%s' "
                              "does not use proper suffix '%s'"

            # Parse out one emacs var per line.
            for line in lines[:-1]: # no var on the last line ("PREFIX End:")
                if prefix: line = line[len(prefix):] # strip prefix
                if suffix: line = line[:-len(suffix)] # strip suffix
                # Continuation lines: keep appending to the last variable.
                variable = continued_for
                if line.endswith('\\'):
                    line = line[:-1].rstrip()
                emacs_vars[variable] += ' ' + line
                # NOTE(review): originally inside try/except, with the
                # log.debug below on the failure path.
                variable, value = line.split(':', 1)
                log.debug("local variables error: missing colon "
                          "in local variables entry: '%s'" % line)
                # Do NOT lowercase the variable name, because Emacs only
                # allows "mode" (and not "Mode", "MoDe", etc.) in this block.
                value = value.strip()
                if value.endswith('\\'):
                    # Trailing backslash marks a continuation onto the
                    # next line.
                    value = value[:-1].rstrip()
                    continued_for = variable
                emacs_vars[variable] = value

        # Unquote values: strip one pair of surrounding double quotes.
        for var, val in emacs_vars.items():
            if len(val) > 1 and (val.startswith('"') and val.endswith('"')
               or val.startswith('"') and val.endswith('"')):
                # NOTE(review): both alternatives above test double quotes;
                # one presumably targeted single quotes -- confirm upstream.
                emacs_vars[var] = val[1:-1]
448 # Cribbed from a post by Bart Lateur:
449 # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
450 _detab_re = re.compile(r'(.*?)\t', re.M)
451 def _detab_sub(self, match):
453 return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width))
    def _detab(self, text):
        r"""Remove (leading?) tabs from a file.

            >>> m = Markdown()
            >>> m._detab("\tfoo")
            '    foo'
            >>> m._detab(" \tfoo")
            '    foo'
            >>> m._detab("\t foo")
            '     foo'
            >>> m._detab("  foo\n\tbar\tblam")
            '  foo\n    bar blam'
        """
        return self._detab_re.subn(self._detab_sub, text)[0]
    # I broke out the html5 tags here and add them to _block_tags_a and
    # _block_tags_b. This way html5 tags are easy to keep track of.
    _html5tags = '|article|aside|header|hgroup|footer|nav|section|figure|figcaption'

    _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
    _block_tags_a += _html5tags

    # NOTE(review): both compiled patterns below appear truncated in this
    # excerpt -- the attribute-matching line(s) and the closing
    # `""" % _block_tags_x, re.X | re.M)` are elided.
    _strict_tag_block_re = re.compile(r"""
        ^ # start of line (with re.M)
        <(%s) # start tag = \2
        (.*\n)*? # any number of lines, minimally matching
        </\2> # the matching end tag
        [ \t]* # trailing spaces/tabs
        (?=\n+|\Z) # followed by a newline or end of document

    # The liberal variant also accepts the end tag mid-line (`.*</\2>`).
    _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
    _block_tags_b += _html5tags

    _liberal_tag_block_re = re.compile(r"""
        ^ # start of line (with re.M)
        <(%s) # start tag = \2
        (.*\n)*? # any number of lines, minimally matching
        .*</\2> # the matching end tag
        [ \t]* # trailing spaces/tabs
        (?=\n+|\Z) # followed by a newline or end of document

    # Matches a `markdown="1"` / `markdown='1'` attribute on a block tag
    # (used by the "markdown-in-html" extra).
    _html_markdown_attr_re = re.compile(
        r'''\s+markdown=("1"|'1')''')
    def _hash_html_block_sub(self, match, raw=False):
        # Replace a matched HTML block with a hash-key placeholder,
        # remembering the original text in `self.html_blocks`. `raw`
        # indicates the HTML came from the original source (matters in
        # safe_mode).
        html = match.group(1)
        if raw and self.safe_mode:
            html = self._sanitize_html(html)
        elif 'markdown-in-html' in self.extras and 'markdown=' in html:
            first_line = html.split('\n', 1)[0]
            m = self._html_markdown_attr_re.search(first_line)
            # NOTE(review): an `if m:` guard appears to be elided here.
            # Hash the wrapper tag's first/last lines separately so the
            # middle can still be markdown-processed.
            lines = html.split('\n')
            middle = '\n'.join(lines[1:-1])
            last_line = lines[-1]
            # Strip the markdown="1" attribute from the emitted tag.
            first_line = first_line[:m.start()] + first_line[m.end():]
            f_key = _hash_text(first_line)
            self.html_blocks[f_key] = first_line
            l_key = _hash_text(last_line)
            self.html_blocks[l_key] = last_line
            # NOTE(review): the closing `l_key, "\n\n"])` of this return
            # appears to be elided.
            return ''.join(["\n\n", f_key,
                            "\n\n", middle, "\n\n",
        key = _hash_text(html)
        self.html_blocks[key] = html
        return "\n\n" + key + "\n\n"
    def _hash_html_blocks(self, text, raw=False):
        """Hashify HTML blocks

        We only want to do this for block-level HTML tags, such as headers,
        lists, and tables. That's because we still want to wrap <p>s around
        "paragraphs" that are wrapped in non-block-level tags, such as anchors,
        phrase emphasis, and spans. The list of tags we're looking for is
        hard-coded.

        @param raw {boolean} indicates if these are raw HTML blocks in
            the original source. It makes a difference in "safe" mode.
        """
        # NOTE(review): this excerpt elides several lines (an early-out for
        # text without '<', the `while True:`/`try:` structure of the
        # comment-scanning loop, `break`/`continue` statements, the `start`
        # bookkeeping and the final `return text`).
        # Pass `raw` value into our calls to self._hash_html_block_sub.
        hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)

        # First, look for nested blocks, e.g.:
        #   <div>
        #       <div>
        #       tags for inner block must be indented.
        #       </div>
        #   </div>
        #
        # The outermost tags must start at the left margin for this to match, and
        # the inner nested divs must be indented.
        # We need to do this before the next, more liberal match, because the next
        # match will start at the first `<div>` and stop at the first `</div>`.
        text = self._strict_tag_block_re.sub(hash_html_block_sub, text)

        # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
        text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)

        # Special case just for <hr />. It was easier to make a special
        # case than to make the other regex more complicated.
        _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
        text = _hr_tag_re.sub(hash_html_block_sub, text)

        # Special case for standalone HTML comments:

        # Delimiters for next comment block.
        start_idx = text.index("<!--", start)
        except ValueError as ex:
        end_idx = text.index("-->", start_idx) + 3
        except ValueError as ex:

        # Start position for next comment block search.

        # Validate whitespace before comment.
        # - Up to `tab_width - 1` spaces before start_idx.
        for i in range(self.tab_width - 1):
            if text[start_idx - 1] != ' ':
        # - Must be preceded by 2 newlines or hit the start of
        #   the document.
        elif start_idx == 1 and text[0] == '\n':
            start_idx = 0 # to match minute detail of Markdown.pl regex
        elif text[start_idx-2:start_idx] == '\n\n':

        # Validate whitespace after comment.
        # - Any number of spaces and tabs.
        while end_idx < len(text):
            if text[end_idx] not in ' \t':
        # - Must be following by 2 newlines or hit end of text.
        if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):

        # Escape and hash (must match `_hash_html_block_sub`).
        html = text[start_idx:end_idx]
        if raw and self.safe_mode:
            html = self._sanitize_html(html)
        key = _hash_text(html)
        self.html_blocks[key] = html
        text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]

        if "xml" in self.extras:
            # Treat XML processing instructions and namespaced one-liner
            # tags as if they were block HTML tags. E.g., if standalone
            # (i.e. are their own paragraph), the following do not get
            # wrapped in a <p> tag:
            #    <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>
            _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
            text = _xml_oneliner_re.sub(hash_html_block_sub, text)
    def _strip_link_definitions(self, text):
        # Strips link definitions from text, stores the URLs and titles in
        # hash references (self.urls / self.titles via the sub callback).
        less_than_tab = self.tab_width - 1

        # Link defs are in the form:
        #   [id]: url "optional title"
        # NOTE(review): several pattern lines (notably the URL group = \2
        # and the closing of the pattern) appear elided from this excerpt.
        _link_def_re = re.compile(r"""
            ^[ ]{0,%d}\[(.+)\]: # id = \1
            \n? # maybe *one* newline
            \n? # maybe one newline
            (?<=\s) # lookbehind for whitespace
            ([^\n]*) # title = \3
            )? # title is optional
            """ % less_than_tab, re.X | re.M | re.U)
        return _link_def_re.sub(self._extract_link_def_sub, text)
669 def _extract_link_def_sub(self, match):
670 id, url, title = match.groups()
671 key = id.lower() # Link IDs are case-insensitive
672 self.urls[key] = self._encode_amps_and_angles(url)
674 self.titles[key] = title
677 def _extract_footnote_def_sub(self, match):
678 id, text = match.groups()
679 text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
680 normed_id = re.sub(r'\W', '-', id)
681 # Ensure footnote text ends with a couple newlines (for some
682 # block gamut matches).
683 self.footnotes[normed_id] = text + "\n\n"
    def _strip_footnote_definitions(self, text):
        """A footnote definition looks like this:

            [^note-id]: Text of the note.

                May include one or more indented paragraphs.

        Where,
        - The 'note-id' can be pretty much anything, though typically it
          is the number of the footnote.
        - The first paragraph may start on the next line, like so:

            [^note-id]:
        """
        less_than_tab = self.tab_width - 1
        # NOTE(review): several pattern lines and the flags argument
        # (presumably re.X | re.M) appear elided from this excerpt.
        footnote_def_re = re.compile(r'''
            ^[ ]{0,%d}\[\^(.+)\]: # id = \1
            ( # footnote text = \2
              # First line need not start with the spaces.
              (?:[ ]{%d} | \t) # Subsequent lines must be indented.
              # Lookahead for non-space at line-start, or end of doc.
              (?:(?=^[ ]{0,%d}\S)|\Z)
            ''' % (less_than_tab, self.tab_width, self.tab_width),
        return footnote_def_re.sub(self._extract_footnote_def_sub, text)

    # Horizontal-rule data: (char, regex) pairs for '*', '-' and '_' rules.
    # NOTE(review): the `_hr_data = [` opening and the closing `]` appear
    # to be elided from this excerpt.
        ('*', re.compile(r"^[ ]{0,3}\*(.*?)$", re.M)),
        ('-', re.compile(r"^[ ]{0,3}\-(.*?)$", re.M)),
        ('_', re.compile(r"^[ ]{0,3}\_(.*?)$", re.M)),
726 def _run_block_gamut(self, text):
727 # These are all the transformations that form block-level
728 # tags like paragraphs, headers, and list items.
730 text = self._do_headers(text)
732 # Do Horizontal Rules:
733 # On the number of spaces in horizontal rules: The spec is fuzzy: "If
734 # you wish, you may use spaces between the hyphens or asterisks."
735 # Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
736 # hr chars to one or two. We'll reproduce that limit here.
737 hr = "\n<hr"+self.empty_element_suffix+"\n"
738 for ch, regex in self._hr_data:
740 for m in reversed(list(regex.finditer(text))):
741 tail = m.group(1).rstrip()
742 if not tail.strip(ch + ' ') and tail.count(" ") == 0:
743 start, end = m.span()
744 text = text[:start] + hr + text[end:]
746 text = self._do_lists(text)
748 if "pyshell" in self.extras:
749 text = self._prepare_pyshell_blocks(text)
750 if "wiki-tables" in self.extras:
751 text = self._do_wiki_tables(text)
753 text = self._do_code_blocks(text)
755 text = self._do_block_quotes(text)
757 # We already ran _HashHTMLBlocks() before, in Markdown(), but that
758 # was to escape raw HTML in the original Markdown source. This time,
759 # we're escaping the markup we've just created, so that we don't wrap
760 # <p> tags around block-level tags.
761 text = self._hash_html_blocks(text)
763 text = self._form_paragraphs(text)
    def _pyshell_block_sub(self, match):
        # Indent a matched interactive-shell session by one tab-width so
        # the code-block pass renders it as <pre><code>.
        # NOTE(review): this excerpt appears truncated -- originally the
        # lines were dedented first, and `s` was closed (`+ '\n\n')`) and
        # returned.
        lines = match.group(0).splitlines(0)
        indent = ' ' * self.tab_width
        s = ('\n' # separate from possible cuddled paragraph
             + indent + ('\n'+indent).join(lines)
776 def _prepare_pyshell_blocks(self, text):
777 """Ensure that Python interactive shell sessions are put in
778 code blocks -- even if not properly indented.
780 if ">>>" not in text:
783 less_than_tab = self.tab_width - 1
784 _pyshell_block_re = re.compile(r"""
785 ^([ ]{0,%d})>>>[ ].*\n # first line
786 ^(\1.*\S+.*\n)* # any number of subsequent lines
787 ^\n # ends with a blank line
788 """ % less_than_tab, re.M | re.X)
790 return _pyshell_block_re.sub(self._pyshell_block_sub, text)
    def _wiki_table_sub(self, match):
        # Render one matched block of `|| cell ||` wiki-table lines as an
        # HTML <table>.
        # NOTE(review): several lines are elided in this excerpt (collecting
        # the parsed `rows`, opening each <tr>/<td>, iterating cells).
        ttext = match.group(0).strip()
        #print 'wiki table: %r' % match.group(0)
        for line in ttext.splitlines(0):
            line = line.strip()[2:-2].strip() # drop the outer '||' pair
            # Split on '||' separators, honoring backslash-escaped pipes.
            row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
        hlines = ['<table>', '<tbody>']
        hrow.append(self._run_span_gamut(cell)) # span-process each cell
        hlines.append(''.join(hrow))
        hlines += ['</tbody>', '</table>']
        return '\n'.join(hlines) + '\n'
813 def _do_wiki_tables(self, text):
818 less_than_tab = self.tab_width - 1
819 wiki_table_re = re.compile(r'''
820 (?:(?<=\n\n)|\A\n?) # leading blank line
821 ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n # first line
822 (^\1\|\|.+?\|\|\n)* # any number of subsequent lines
823 ''' % less_than_tab, re.M | re.X)
824 return wiki_table_re.sub(self._wiki_table_sub, text)
826 def _run_span_gamut(self, text):
827 # These are all the transformations that occur *within* block-level
828 # tags like paragraphs, headers, and list items.
830 text = self._do_code_spans(text)
832 text = self._escape_special_chars(text)
834 # Process anchor and image tags.
835 text = self._do_links(text)
837 # Make links out of things like `<http://example.com/>`
838 # Must come after _do_links(), because you can use < and >
839 # delimiters in inline links like [this](<url>).
840 text = self._do_auto_links(text)
842 if "link-patterns" in self.extras:
843 text = self._do_link_patterns(text)
845 text = self._encode_amps_and_angles(text)
847 text = self._do_italics_bold_underline_mono(text)
849 if "smarty-pants" in self.extras:
850 text = self._do_smart_punctuation(text)
853 text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
    # "Sorta" because auto-links are identified as "tag" tokens.
    # NOTE(review): most of this tokenizer pattern (the tag alternative,
    # HTML-comment alternative, closing `"""` and flags) appears elided
    # from this excerpt.
    _sorta_html_tokenize_re = re.compile(r"""
        (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))* # attributes
        # auto-link (e.g., <http://www.activestate.com/>)
        <\?.*?\?> # processing instruction
    def _escape_special_chars(self, text):
        # Python markdown note: the HTML tokenization here differs from
        # that in Markdown.pl, hence the behaviour for subtle cases can
        # differ (I believe the tokenizer here does a better job because
        # it isn't susceptible to unmatched '<' and '>' in HTML tags).
        # Note, however, that '>' is not allowed in an auto-link URL
        # here.
        # NOTE(review): the `escaped = []` initializer and the
        # `if is_html_markup:` / `else:` branch lines appear elided; the
        # two append calls below belong to those two branches.
        is_html_markup = False
        for token in self._sorta_html_tokenize_re.split(text):
                # Within tags/HTML-comments/auto-links, encode * and _
                # so they don't conflict with their use in Markdown for
                # italics and strong. We're replacing each such
                # character with its corresponding MD5 checksum value;
                # this is likely overkill, but it should prevent us from
                # colliding with the escape values by accident.
                escaped.append(token.replace('*', self._escape_table['*'])
                                    .replace('_', self._escape_table['_']))
                escaped.append(self._encode_backslash_escapes(token))
            # The split alternates non-markup / markup tokens.
            is_html_markup = not is_html_markup
        return ''.join(escaped)
    def _hash_html_spans(self, text):
        # Used for safe_mode.
        # NOTE(review): several lines appear elided (the `return True`
        # bodies of `_is_auto_link`, the `tokens = []` initializer and the
        # appends of hashed/plain tokens).

        def _is_auto_link(s):
            # Auto-links must survive safe_mode untouched.
            if ':' in s and self._auto_link_re.match(s):
            elif '@' in s and self._auto_email_link_re.match(s):

        is_html_markup = False
        for token in self._sorta_html_tokenize_re.split(text):
            if is_html_markup and not _is_auto_link(token):
                # Replace raw HTML spans with a hash key; the sanitized
                # form is restored later by _unhash_html_spans().
                sanitized = self._sanitize_html(token)
                key = _hash_text(sanitized)
                self.html_spans[key] = sanitized
            # The split alternates non-markup / markup tokens.
            is_html_markup = not is_html_markup
        return ''.join(tokens)
922 def _unhash_html_spans(self, text):
923 for key, sanitized in self.html_spans.items():
924 text = text.replace(key, sanitized)
    def _sanitize_html(self, s):
        # safe_mode handling for a raw HTML span/block:
        #   "replace" -> drop it entirely; "escape" -> entity-escape it.
        if self.safe_mode == "replace":
            return self.html_removed_text
        elif self.safe_mode == "escape":
            # NOTE(review): the `replacements` list of (before, after)
            # entity pairs and the `return s` appear elided here.
            for before, after in replacements:
                s = s.replace(before, after)
        # NOTE(review): presumably under an elided `else:` branch.
        raise MarkdownError("invalid value for 'safe_mode': %r (must be "
                            "'escape' or 'replace')" % self.safe_mode)
    # NOTE(review): both patterns below appear heavily truncated in this
    # excerpt (missing the paren/url/id groups, closing quotes and flags).
    _tail_of_inline_link_re = re.compile(r'''
        # Match tail of: [text](/url/) or [text](/url/ "title")
        (['"]) # quote char = \3
        )? # title is optional

    _tail_of_reference_link_re = re.compile(r'''
        # Match tail of: [text][id]
        [ ]? # one optional space
        (?:\n[ ]*)? # one optional newline followed by spaces
    def _do_links(self, text):
        """Turn Markdown link shortcuts into XHTML <a> and <img> tags.

        This is a combination of Markdown.pl's _DoAnchors() and
        _DoImages(). They are done together because that simplified the
        approach. It was necessary to use a different approach than
        Markdown.pl because of the lack of atomic matching support in
        Python's regex engine used in $g_nested_brackets.
        """
        # NOTE(review): this excerpt elides a number of lines (the
        # `curr_pos = 0` initializer, try/except around `.index()`, the
        # bracket-depth bookkeeping, several `if`/`else:`/`continue`/
        # `return` lines); comments below annotate the visible logic only.
        MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24

        # `anchor_allowed_pos` is used to support img links inside
        # anchors, but not anchors inside anchors. An anchor's start
        # pos must be `>= anchor_allowed_pos`.
        anchor_allowed_pos = 0

        while True: # Handle the next link.
            # The next '[' is the start of:
            # - an inline anchor:   [text](url "title")
            # - a reference anchor: [text][id]
            # - an inline img:      ![text](url "title")
            # - a reference img:    ![text][id]
            # - a footnote ref:     [^id]
            #   (Only if 'footnotes' extra enabled)
            # - a footnote defn:    [^id]: ...
            #   (Only if 'footnotes' extra enabled) These have already
            #   been stripped in _strip_footnote_definitions() so no
            #   need to watch for them.
            # - a link definition:  [id]: url "title"
            #   These have already been stripped in
            #   _strip_link_definitions() so no need to watch for them.
            # - not markup:         [...anything else...
            start_idx = text.index('[', curr_pos)
            text_length = len(text)

            # Find the matching closing ']'.
            # Markdown.pl allows *matching* brackets in link text so we
            # will here too. Markdown.pl *doesn't* currently allow
            # matching brackets in img alt text -- we'll differ in that
            # regard.
            for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
                                            text_length)):
                if bracket_depth < 0:
                # Closing bracket not found within sentinel length.
                # This isn't markup.
                curr_pos = start_idx + 1
            link_text = text[start_idx+1:p]

            # Possibly a footnote ref?
            if "footnotes" in self.extras and link_text.startswith("^"):
                normed_id = re.sub(r'\W', '-', link_text[1:])
                if normed_id in self.footnotes:
                    self.footnote_ids.append(normed_id)
                    result = '<sup class="footnote-ref" id="fnref-%s">' \
                             '<a href="#fn-%s">%s</a></sup>' \
                             % (normed_id, normed_id, len(self.footnote_ids))
                    text = text[:start_idx] + result + text[p+1:]
                # This id isn't defined, leave the markup alone.

            # Now determine what this is by the remainder.
            if p == text_length:

            # Inline anchor or img?
            if text[p] == '(': # attempt at perf improvement
                match = self._tail_of_inline_link_re.match(text, p)
                # Handle an inline anchor or img.
                # A preceding '!' marks an image rather than an anchor.
                is_img = start_idx > 0 and text[start_idx-1] == "!"

                url, title = match.group("url"), match.group("title")
                if url and url[0] == '<':
                    url = url[1:-1] # '<url>' -> 'url'
                # We've got to encode these to avoid conflicting
                # with italics/bold.
                url = url.replace('*', self._escape_table['*']) \
                         .replace('_', self._escape_table['_'])
                title_str = ' title="%s"' % (
                    _xml_escape_attr(title)
                        .replace('*', self._escape_table['*'])
                        .replace('_', self._escape_table['_']))

                # NOTE(review): originally under an elided `if is_img:`;
                # the '"' -> '"' replace below is a no-op -- presumably
                # an entity-decoding artifact of '&quot;' -- confirm
                # against upstream.
                result = '<img src="%s" alt="%s"%s%s' \
                    % (url.replace('"', '"'),
                       _xml_escape_attr(link_text),
                       title_str, self.empty_element_suffix)
                curr_pos = start_idx + len(result)
                text = text[:start_idx] + result + text[match.end():]
                elif start_idx >= anchor_allowed_pos:
                    result_head = '<a href="%s"%s>' % (url, title_str)
                    result = '%s%s</a>' % (result_head, link_text)
                    # <img> allowed from curr_pos on, <a> from
                    # anchor_allowed_pos on.
                    curr_pos = start_idx + len(result_head)
                    anchor_allowed_pos = start_idx + len(result)
                    text = text[:start_idx] + result + text[match.end():]
                # Anchor not allowed here.
                curr_pos = start_idx + 1

            # Reference anchor or img?
                match = self._tail_of_reference_link_re.match(text, p)
                # Handle a reference-style anchor or img.
                is_img = start_idx > 0 and text[start_idx-1] == "!"

                link_id = match.group("id").lower()
                link_id = link_text.lower() # for links like [this][]
                if link_id in self.urls:
                    url = self.urls[link_id]
                    # We've got to encode these to avoid conflicting
                    # with italics/bold.
                    url = url.replace('*', self._escape_table['*']) \
                             .replace('_', self._escape_table['_'])
                    title = self.titles.get(link_id)
                    title = _xml_escape_attr(title) \
                        .replace('*', self._escape_table['*']) \
                        .replace('_', self._escape_table['_'])
                    title_str = ' title="%s"' % title

                    # NOTE(review): same apparent '&quot;' artifact as above.
                    result = '<img src="%s" alt="%s"%s%s' \
                        % (url.replace('"', '"'),
                           link_text.replace('"', '"'),
                           title_str, self.empty_element_suffix)
                    curr_pos = start_idx + len(result)
                    text = text[:start_idx] + result + text[match.end():]
                    elif start_idx >= anchor_allowed_pos:
                        # NOTE(review): the first `result` assignment below is
                        # dead -- it is immediately rebuilt two lines later.
                        result = '<a href="%s"%s>%s</a>' \
                            % (url, title_str, link_text)
                        result_head = '<a href="%s"%s>' % (url, title_str)
                        result = '%s%s</a>' % (result_head, link_text)
                        # <img> allowed from curr_pos on, <a> from
                        # anchor_allowed_pos on.
                        curr_pos = start_idx + len(result_head)
                        anchor_allowed_pos = start_idx + len(result)
                        text = text[:start_idx] + result + text[match.end():]
                    # Anchor not allowed here.
                    curr_pos = start_idx + 1
                # This id isn't defined, leave the markup alone.
                curr_pos = match.end()

            # Otherwise, it isn't markup.
            curr_pos = start_idx + 1
def header_id_from_text(self, text, prefix, n):
    """Generate a header id attribute value from the given header text.

    This is only called if the "header-ids" extra is enabled.
    Subclasses may override this for different header ids.

    @param text {str} The text of the header tag
    @param prefix {str} The requested prefix for header ids. This is the
        value of the "header-ids" extra key, if any. Otherwise, None.
    @param n {int} The <hN> tag number, i.e. `1` for an <h1> tag.
    @returns {str} The value for the header tag's "id" attribute. Return
        None to not have an id attribute and to exclude this header from
        the TOC (if the "toc" extra is specified).
    """
    header_id = _slugify(text)
    # Py3 fix: `basestring` does not exist in Python 3 (this file already
    # uses f-strings elsewhere, so it targets Python 3).
    if prefix and isinstance(prefix, str):
        header_id = prefix + '-' + header_id
    if header_id in self._count_from_header_id:
        # Duplicate header text: de-duplicate with a numeric suffix.
        self._count_from_header_id[header_id] += 1
        header_id += '-%s' % self._count_from_header_id[header_id]
    else:
        self._count_from_header_id[header_id] = 1
    return header_id
1174 def _toc_add_entry(self, level, id, name):
1175 if self._toc is None:
1177 self._toc.append((level, id, name))
# Setext-style ("underlined") header handling. NOTE(review): this appears to
# be dead code in this fork -- _do_headers below has the corresponding
# substitution commented out ("zarvox: nuke this nonsense").
_setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M)
def _setext_h_sub(self, match):
    # '=' underline -> h1, '-' underline -> h2.
    n = {"=": 1, "-": 2}[match.group(2)[0]]
    demote_headers = self.extras.get("demote-headers")
    # NOTE(review): presumably guarded by `if demote_headers:` originally --
    # as written this raises TypeError when the extra is absent; confirm.
    n = min(n + demote_headers, 6)
    if "header-ids" in self.extras:
        header_id = self.header_id_from_text(match.group(1),
            self.extras["header-ids"], n)
        # NOTE(review): no fallback assignment of `header_id_attr` is visible
        # for the case where "header-ids" is off -- likely missing lines.
        header_id_attr = ' id="%s"' % header_id
    html = self._run_span_gamut(match.group(1))
    if "toc" in self.extras and header_id:
        self._toc_add_entry(n, header_id, html)
    return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
1196 _atx_h_re = re.compile(r'''
1197 ^(\={1,6}) # \1 = string of ='s
1199 (.+?) # \2 = Header text
1201 (?<!\\) # ensure not an escaped trailing '#'
1202 \=* # optional closing #'s (not counted)
1205 def _atx_h_sub(self, match):
1206 n = len(match.group(1))
1207 demote_headers = self.extras.get("demote-headers")
1209 n = min(n + demote_headers, 6)
1211 if "header-ids" in self.extras:
1212 header_id = self.header_id_from_text(match.group(2),
1213 self.extras["header-ids"], n)
1215 header_id_attr = ' id="%s"' % header_id
1216 html = self._run_span_gamut(match.group(2))
1217 if "toc" in self.extras and header_id:
1218 self._toc_add_entry(n, header_id, html)
1219 return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
1221 def _do_headers(self, text):
1222 # Setext-style headers:
1228 # zarvox: nuke this nonsense
1229 # text = self._setext_h_re.sub(self._setext_h_sub, text)
1231 # atx-style headers:
1234 # ## Header 2 with closing hashes ##
1237 text = self._atx_h_re.sub(self._atx_h_sub, text)
# List-marker characters, dokuwiki-flavored: '*' marks unordered lists,
# '-' and '#' mark ordered lists.
_marker_ul_chars = '*'
_marker_ol_chars = '-#'
# Re-usable (non-capturing) marker patterns built from the sets above.
_marker_any = r'(?:[%s]|[%s])' % (_marker_ul_chars, _marker_ol_chars)
_marker_ul = '(?:[%s])' % _marker_ul_chars
_marker_ol = r'(?:[%s])' % _marker_ol_chars
1248 def _list_sub(self, match):
1249 lst = match.group(1)
1250 lst_type = match.group(3) in self._marker_ul_chars and "ul" or "ol"
1251 result = self._process_list_items(lst)
1253 return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type)
1255 return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type)
def _do_lists(self, text):
    # Form HTML ordered (numbered) and unordered (bulleted) lists.
    # NOTE(review): the opening of the `whole_list` verbose-regex literal and
    # most of its alternatives appear to be missing from this copy, as does
    # the closing of the `list_re` re.compile() call below. Compare with
    # upstream python-markdown2 before editing.
    for marker_pat in (self._marker_ul, self._marker_ol):
        # Re-usable pattern to match any entire ul or ol list:
        less_than_tab = self.tab_width - 1
            (%s)            # \3 = first list item marker
            (?!\ *\3\ )     # '- - - ...' isn't a list. See 'not_quite_a_list' test case.
            (?!             # Negative lookahead for another list item marker
        ''' % (less_than_tab, marker_pat, marker_pat)
        # We use a different prefix before nested lists than top-level lists.
        # See extended comment in _process_list_items().
        #
        # Note: There's a bit of duplication here. My original implementation
        # created a scalar regex pattern as the conditional result of the test on
        # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
        # substitution once, using the scalar as the pattern. This worked,
        # everywhere except when running under MT on my hosting account at Pair
        # Networks. There, this caused all rebuilds to be killed by the reaper (or
        # perhaps they crashed, but that seems incredibly unlikely given that the
        # same script on the same server ran fine *except* under MT. I've spent
        # more time trying to figure out why this is happening than I'd like to
        # admit. My only guess, backed up by the fact that this workaround works,
        # is that Perl optimizes the substition when it can figure out that the
        # pattern will never change, and when this optimization isn't on, we run
        # afoul of the reaper. Thus, the slightly redundant code to that uses two
        # static s/// patterns rather than one conditional pattern.
        sub_list_re = re.compile("^"+whole_list, re.X | re.M | re.S)
        text = sub_list_re.sub(self._list_sub, text)
        list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,
        text = list_re.sub(self._list_sub, text)
        # NOTE(review): a trailing `return text` also appears to be missing.
# Pattern for one list item inside a matched list block.
# NOTE(review): the closing of this re.compile() call (the flag arguments
# and closing paren) appears to be missing from this copy.
_list_item_re = re.compile(r'''
    (\n)?                   # leading line = \1
    (^[ \t]*)               # leading whitespace = \2
    (?P<marker>%s) [ \t]+   # list marker = \3
    ((?:.+?)                # list item text = \4
    (\n{1,2}))              # eols = \5
    (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+))
    ''' % (_marker_any, _marker_any),

# Tracks whether the previous <li> ended with a blank line ("loose" item),
# which forces block-level processing of the next item.
_last_li_endswith_two_eols = False
1324 def _list_item_sub(self, match):
1325 item = match.group(4)
1326 leading_line = match.group(1)
1327 leading_space = match.group(2)
1328 if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
1329 item = self._run_block_gamut(self._outdent(item))
1331 # Recursion for sub-lists:
1332 item = self._do_lists(self._outdent(item))
1333 if item.endswith('\n'):
1335 item = self._run_span_gamut(item)
1336 self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
1337 return "<li>%s</li>\n" % item
1339 def _process_list_items(self, list_str):
1340 # Process the contents of a single ordered or unordered list,
1341 # splitting it into individual list items.
1343 # The $g_list_level global keeps track of when we're inside a list.
1344 # Each time we enter a list, we increment it; when we leave a list,
1345 # we decrement. If it's zero, we're not in a list anymore.
1347 # We do this because when we're not inside a list, we want to treat
1348 # something like this:
1350 # I recommend upgrading to version
1351 # 8. Oops, now this line is treated
1354 # As a single paragraph, despite the fact that the second line starts
1355 # with a digit-period-space sequence.
1357 # Whereas when we're inside a list (or sub-list), that line will be
1358 # treated as the start of a sub-list. What a kludge, huh? This is
1359 # an aspect of Markdown's syntax that's hard to parse perfectly
1360 # without resorting to mind-reading. Perhaps the solution is to
1361 # change the syntax rules such that sub-lists must start with a
1362 # starting cardinal number; e.g. "1." or "a.".
1363 self.list_level += 1
1364 self._last_li_endswith_two_eols = False
1365 list_str = list_str.rstrip('\n') + '\n'
1366 list_str = self._list_item_re.sub(self._list_item_sub, list_str)
1367 self.list_level -= 1
1370 def _get_pygments_lexer(self, lexer_name):
1372 from pygments import lexers, util
1376 return lexers.get_lexer_by_name(lexer_name)
1377 except util.ClassNotFound:
def _color_with_pygments(self, codeblock, lexer, **formatter_opts):
    # Highlight `codeblock` to HTML with the given Pygments lexer.
    # Importing the submodule also binds the `pygments` package name used
    # in the highlight() call below.
    import pygments.formatters

    class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):
        def _wrap_code(self, inner):
            """A function for use in a Pygments Formatter which
            wraps in <code> tags.
            """
            # NOTE(review): the yield statements of this generator appear
            # to be missing from this copy -- compare with upstream
            # python-markdown2 before editing.

        def wrap(self, source, outfile):
            """Return the source with a code, pre, and div."""
            return self._wrap_div(self._wrap_pre(self._wrap_code(source)))

    formatter = HtmlCodeFormatter(cssclass="codehilite", **formatter_opts)
    return pygments.highlight(codeblock, lexer, formatter)
1401 def _code_block_sub(self, match):
1402 codeblock = match.group(1)
1403 codeblock = self._outdent(codeblock)
1404 codeblock = self._detab(codeblock)
1405 codeblock = codeblock.lstrip('\n') # trim leading newlines
1406 codeblock = codeblock.rstrip() # trim trailing whitespace
1408 if "code-color" in self.extras and codeblock.startswith(":::"):
1409 lexer_name, rest = codeblock.split('\n', 1)
1410 lexer_name = lexer_name[3:].strip()
1411 lexer = self._get_pygments_lexer(lexer_name)
1412 codeblock = rest.lstrip("\n") # Remove lexer declaration line.
1414 formatter_opts = self.extras['code-color'] or {}
1415 colored = self._color_with_pygments(codeblock, lexer,
1417 return "\n\n%s\n\n" % colored
1419 codeblock = self._encode_code(codeblock)
1420 pre_class_str = self._html_class_str_from_tag("pre")
1421 code_class_str = self._html_class_str_from_tag("code")
1422 return "\n\n<pre%s><code%s>%s\n</code></pre>\n\n" % (
1423 pre_class_str, code_class_str, codeblock)
1425 def _html_class_str_from_tag(self, tag):
1426 """Get the appropriate ' class="..."' string (note the leading
1427 space), if any, for the given tag.
1429 if "html-classes" not in self.extras:
1432 html_classes_from_tag = self.extras["html-classes"]
1436 if tag in html_classes_from_tag:
1437 return ' class="%s"' % html_classes_from_tag[tag]
1440 def _do_code_blocks(self, text):
1441 """Process Markdown `<pre><code>` blocks."""
1442 code_block_re = re.compile(r'''
1444 ( # $1 = the code block -- one or more lines, starting with a space/tab
1446 (?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces
1450 ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
1451 ''' % (self.tab_width, self.tab_width),
1454 return code_block_re.sub(self._code_block_sub, text)
1457 # Rules for a code span:
1458 # - backslash escapes are not interpreted in a code span
1459 # - to include one or or a run of more backticks the delimiters must
1460 # be a longer run of backticks
1461 # - cannot start or end a code span with a backtick; pad with a
1462 # space and that space will be removed in the emitted HTML
1463 # See `test/tm-cases/escapes.text` for a number of edge-case
1465 _code_span_re = re.compile(r'''
1467 (`+) # \1 = Opening run of `
1468 (?!`) # See Note A test/tm-cases/escapes.text
1469 (.+?) # \2 = The code block
1471 \1 # Matching closer
1475 def _code_span_sub(self, match):
1476 c = match.group(2).strip(" \t")
1477 c = self._encode_code(c)
1478 return "<code>%s</code>" % c
1480 def _do_code_spans(self, text):
1481 # * Backtick quotes are used for <code></code> spans.
1483 # * You can use multiple backticks as the delimiters if you want to
1484 # include literal backticks in the code span. So, this input:
1486 # Just type ``foo `bar` baz`` at the prompt.
1488 # Will translate to:
1490 # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
1492 # There's no arbitrary limit to the number of backticks you
1493 # can use as delimters. If you need three consecutive backticks
1494 # in your code, use four for delimiters, etc.
1496 # * You can use spaces to get literal backticks at the edges:
1498 # ... type `` `bar` `` ...
1502 # ... type <code>`bar`</code> ...
1503 return self._code_span_re.sub(self._code_span_sub, text)
1505 def _encode_code(self, text):
1506 """Encode/escape certain characters inside Markdown code runs.
1507 The point is that in code, these characters are literals,
1508 and lose their special Markdown meanings.
1511 # Encode all ampersands; HTML entities are not
1512 # entities within a Markdown code span.
1514 # Do the angle bracket song and dance:
1517 # Now, escape characters that are magic in Markdown:
1518 ('*', self._escape_table['*']),
1519 ('_', self._escape_table['_']),
1520 ('{', self._escape_table['{']),
1521 ('}', self._escape_table['}']),
1522 ('[', self._escape_table['[']),
1523 (']', self._escape_table[']']),
1524 ('\\', self._escape_table['\\']),
1526 for before, after in replacements:
1527 text = text.replace(before, after)
1530 _strong_re = re.compile(r"(\*\*)(?=\S)(.+?[*]*)(?<=\S)\*\*", re.S)
1531 _em_re = re.compile(r"(?<!:)(\/\/)(?=\S)(.+?)(?<![\t\n\r\f\v:])\/\/", re.S)
1532 _underline_re = re.compile(r"(__)(?=\S)(.+?)(?<=\S)__", re.S)
1533 _monospace_re = re.compile(r"(\'\')(?=\S)(.+?)(?<=\S)\'\'", re.S)
1534 def _do_italics_bold_underline_mono(self, text):
1535 text = self._strong_re.sub(r"<strong>\2</strong>", text)
1536 text = self._em_re.sub(r"<em>\2</em>", text)
1537 text = self._underline_re.sub(r"<span style='text-decoration:underline;'>\2</span>", text)
1538 text = self._monospace_re.sub(r"<span style='font-family:monospace;'>\2</span>", text)
1541 # "smarty-pants" extra: Very liberal in interpreting a single prime as an
1542 # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and
1543 # "twixt" can be written without an initial apostrophe. This is fine because
1544 # using scare quotes (single quotation marks) is rare.
1545 _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))")
1546 _contractions = ["tis", "twas", "twer", "neath", "o", "n",
1547 "round", "bout", "twixt", "nuff", "fraid", "sup"]
1548 def _do_smart_contractions(self, text):
1549 text = self._apostrophe_year_re.sub(r"’\1", text)
1550 for c in self._contractions:
1551 text = text.replace("'%s" % c, "’%s" % c)
1552 text = text.replace("'%s" % c.capitalize(),
1553 "’%s" % c.capitalize())
1556 # Substitute double-quotes before single-quotes.
1557 _opening_single_quote_re = re.compile(r"(?<!\S)'(?=\S)")
1558 _opening_double_quote_re = re.compile(r'(?<!\S)"(?=\S)')
1559 _closing_single_quote_re = re.compile(r"(?<=\S)'")
1560 _closing_double_quote_re = re.compile(r'(?<=\S)"(?=(\s|,|;|\.|\?|!|$))')
1561 def _do_smart_punctuation(self, text):
1562 """Fancifies 'single quotes', "double quotes", and apostrophes.
1563 Converts --, ---, and ... into en dashes, em dashes, and ellipses.
1565 Inspiration is: <http://daringfireball.net/projects/smartypants/>
1566 See "test/tm-cases/smarty_pants.text" for a full discussion of the
1568 <http://code.google.com/p/python-markdown2/issues/detail?id=42> for a
1569 discussion of some diversion from the original SmartyPants.
1571 if "'" in text: # guard for perf
1572 text = self._do_smart_contractions(text)
1573 text = self._opening_single_quote_re.sub("‘", text)
1574 text = self._closing_single_quote_re.sub("’", text)
1576 if '"' in text: # guard for perf
1577 text = self._opening_double_quote_re.sub("“", text)
1578 text = self._closing_double_quote_re.sub("”", text)
1580 text = text.replace("---", "—")
1581 text = text.replace("--", "–")
1582 text = text.replace("...", "…")
1583 text = text.replace(" . . . ", "…")
1584 text = text.replace(". . .", "…")
1587 _block_quote_re = re.compile(r'''
1588 ( # Wrap whole match in \1
1590 ^[ \t]*>[ \t]? # '>' at the start of a line
1591 .+\n # rest of the first line
1592 (.+\n)* # subsequent consecutive lines
1597 _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M);
1599 _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
1600 def _dedent_two_spaces_sub(self, match):
1601 return re.sub(r'(?m)^ ', '', match.group(1))
1603 def _block_quote_sub(self, match):
1605 bq = self._bq_one_level_re.sub('', bq) # trim one level of quoting
1606 bq = self._ws_only_line_re.sub('', bq) # trim whitespace-only lines
1607 bq = self._run_block_gamut(bq) # recurse
1609 bq = re.sub('(?m)^', ' ', bq)
1610 # These leading spaces screw with <pre> content, so we need to fix that:
1611 bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)
1613 return "<blockquote>\n%s\n</blockquote>\n\n" % bq
1615 def _do_block_quotes(self, text):
1618 return self._block_quote_re.sub(self._block_quote_sub, text)
def _form_paragraphs(self, text):
    # Wrap the remaining text chunks in <p> tags, un-hashing stored HTML
    # blocks and handling "cuddled" lists.
    # NOTE(review): several lines appear to be missing from this copy
    # (the `grafs = []` / `cuddled_list` initializations, the else-arm of
    # the html_blocks test, and the `start` assignment) -- compare with
    # upstream python-markdown2 before editing.
    # Strip leading and trailing lines:
    text = text.strip('\n')

    for i, graf in enumerate(re.split(r"\n{2,}", text)):
        if graf in self.html_blocks:
            # Unhashify HTML blocks
            grafs.append(self.html_blocks[graf])
        if "cuddled-lists" in self.extras:
            # Need to put back trailing '\n' for `_list_item_re`
            # match at the end of the paragraph.
            li = self._list_item_re.search(graf + '\n')
            # Two of the same list marker in this paragraph: a likely
            # candidate for a list cuddled to preceding paragraph
            # text (issue 33). Note the `[-1]` is a quick way to
            # consider numeric bullets (e.g. "1." and "2.") to be
            # equal.
            if (li and len(li.group(2)) <= 3 and li.group("next_marker")
                and li.group("marker")[-1] == li.group("next_marker")[-1]):
                cuddled_list = self._do_lists(graf[start:]).rstrip("\n")
                assert cuddled_list.startswith("<ul>") or cuddled_list.startswith("<ol>")
        graf = self._run_span_gamut(graf)
        grafs.append("<p>" + graf.lstrip(" \t") + "</p>")
        # NOTE(review): presumably guarded by `if cuddled_list:` originally.
        grafs.append(cuddled_list)

    return "\n\n".join(grafs)
def _add_footnotes(self, text):
    # Append the rendered footnotes as a trailing <div class="footnotes">
    # block with backlinks to each reference.
    # NOTE(review): lines appear to be missing from this copy (the
    # `if self.footnotes:` guard, the opening `footer = [` of the list
    # below, and the `<ol>` entry) -- compare with upstream.
        '<div class="footnotes">',
        '<hr' + self.empty_element_suffix,
    for i, id in enumerate(self.footnote_ids):
        footer.append('<li id="fn-%s">' % id)
        footer.append(self._run_block_gamut(self.footnotes[id]))
        backlink = ('<a href="#fnref-%s" '
                    'class="footnoteBackLink" '
                    'title="Jump back to footnote %d in the text.">'
                    '↩</a>' % (id, i+1))
        if footer[-1].endswith("</p>"):
            # Tuck the backlink inside the final paragraph.
            footer[-1] = footer[-1][:-len("</p>")] \
                + ' ' + backlink + "</p>"
        # NOTE(review): likely under a missing `else:` arm.
        footer.append("\n<p>%s</p>" % backlink)
        footer.append('</li>')
    footer.append('</ol>')
    footer.append('</div>')
    return text + '\n\n' + '\n'.join(footer)
1685 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1686 # http://bumppo.net/projects/amputator/
1687 _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
1688 _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
1689 _naked_gt_re = re.compile(r'''(?<![a-z?!/'"-])>''', re.I)
1691 def _encode_amps_and_angles(self, text):
1692 # Smart processing for ampersands and angle brackets that need
1694 text = self._ampersand_re.sub('&', text)
1697 text = self._naked_lt_re.sub('<', text)
1700 # Note: Other markdown implementations (e.g. Markdown.pl, PHP
1701 # Markdown) don't do this.
1702 text = self._naked_gt_re.sub('>', text)
1705 def _encode_backslash_escapes(self, text):
1706 for ch, escape in self._escape_table.items():
1707 text = text.replace("\\"+ch, escape)
1710 _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
1711 def _auto_link_sub(self, match):
1713 return '<a href="%s">%s</a>' % (g1, g1)
# Auto-linkify bare email addresses. NOTE(review): the middle of this
# verbose-regex literal (the optional "mailto:" prefix and the local part
# of the address) appears to be missing from this copy.
_auto_email_link_re = re.compile(r"""
      [-\w]+(\.[-\w]+)*\.[a-z]+
    """, re.I | re.X | re.U)

def _auto_email_link_sub(self, match):
    # Render the matched address as a spam-resistant encoded mailto link.
    return self._encode_email_address(
        self._unescape_special_chars(match.group(1)))
1729 def _do_auto_links(self, text):
1730 text = self._auto_link_re.sub(self._auto_link_sub, text)
1731 text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
def _encode_email_address(self, addr):
    # Input: an email address, e.g. "foo@example.com"
    #
    # Output: the email address as a mailto link, with each character
    # of the address encoded as either a decimal or hex entity, in
    # the hopes of foiling most address harvesting spam bots.
    #
    # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
    # mailing list: <http://tinyurl.com/yu7ue>
    chars = [_xml_encode_email_char_at_random(ch)
             for ch in "mailto:" + addr]
    # Strip the mailto: from the visible part (the first 7 encoded chars).
    addr = '<a href="%s">%s</a>' \
        % (''.join(chars), ''.join(chars[7:]))
    return addr
def _do_link_patterns(self, text):
    """Caveat emptor: there isn't much guarding against link
    patterns being formed inside other standard Markdown links, e.g.
    inside a [link def][like this].

    Dev Notes: *Could* consider prefixing regexes with a negative
    lookbehind assertion to attempt to guard against this.
    """
    link_from_hash = {}
    for regex, repl in self.link_patterns:
        replacements = []
        for match in regex.finditer(text):
            if hasattr(repl, "__call__"):
                href = repl(match)
            else:
                href = match.expand(repl)
            replacements.append((match.span(), href))
        # Apply from the back so earlier recorded spans stay valid.
        for (start, end), href in reversed(replacements):
            escaped_href = (
                href.replace('"', '&quot;')  # b/c of attr quote
                    # To avoid markdown <em> and <strong>:
                    .replace('*', self._escape_table['*'])
                    .replace('_', self._escape_table['_']))
            link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
            # Replace the matched span with an opaque hash so later span
            # processing cannot touch the generated markup.
            hash = _hash_text(link)
            link_from_hash[hash] = link
            text = text[:start] + hash + text[end:]
    for hash, link in link_from_hash.items():
        text = text.replace(hash, link)
    return text
1785 def _unescape_special_chars(self, text):
1786 # Swap back in all the special characters we've hidden.
1787 for ch, hash in self._escape_table.items():
1788 text = text.replace(hash, ch)
1791 def _outdent(self, text):
1792 # Remove one level of line-leading tabs or spaces
1793 return self._outdent_re.sub('', text)
class MarkdownWithExtras(Markdown):
    """A markdowner class that enables most extras:

    - footnotes
    - code-color (only has effect if 'pygments' Python module on path)

    These are not included:
    - pyshell (specific to Python-related documenting)
    - code-friendly (because it *disables* part of the syntax)
    - link-patterns (because you need to specify some actual
      link-patterns anyway)
    """
    extras = ["footnotes", "code-color"]
#---- internal support functions

class UnicodeWithAttrs(str):
    """A subclass of unicode used for the return value of conversion to
    possibly attach some attributes. E.g. the "toc_html" attribute when
    the "toc" extra is used.
    """
    # NOTE(review): the `@property` / `def toc_html(self):` header and the
    # setup lines (`return None`, `lines = []`, the nested `indent()`
    # helper) appear to be missing from this copy, leaving the body below
    # dangling at class level. Compare with upstream before editing.
    """Return the HTML for the current TOC.

    This expects the `_toc` attribute to have been set on this instance.
    """
    if self._toc is None:
        return ' ' * (len(h_stack) - 1)
    h_stack = [0]  # stack of header-level numbers
    for level, id, name in self._toc:
        if level > h_stack[-1]:
            # Deeper header level: open a nested list.
            lines.append("%s<ul>" % indent())
            h_stack.append(level)
        elif level == h_stack[-1]:
            # Sibling header: close the previous list item.
            lines[-1] += "</li>"
        # NOTE(review): an `else:` and the `h_stack.pop()` calls in both
        # while-loops below appear to be missing.
        while level < h_stack[-1]:
            if not lines[-1].endswith("</li>"):
                lines[-1] += "</li>"
            lines.append("%s</ul></li>" % indent())
        lines.append(u'%s<li><a href="#%s">%s</a>' % (
            indent(), id, name))
    while len(h_stack) > 1:
        if not lines[-1].endswith("</li>"):
            lines[-1] += "</li>"
        lines.append("%s</ul>" % indent())
    return '\n'.join(lines) + '\n'
1854 ## {{{ http://code.activestate.com/recipes/577257/ (r1)
1855 _slugify_strip_re = re.compile(r'[^\w\s-]')
1856 _slugify_hyphenate_re = re.compile(r'[-\s]+')
1857 def _slugify(value):
1859 Normalizes string, converts to lowercase, removes non-alpha characters,
1860 and converts spaces to hyphens.
1862 From Django's "django/template/defaultfilters.py".
1865 value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
1866 value = unicode(_slugify_strip_re.sub('', value).strip().lower())
1867 return _slugify_hyphenate_re.sub('-', value)
1868 ## end of http://code.activestate.com/recipes/577257/ }}}
1871 # From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
1872 def _curry(*args, **kwargs):
1873 function, args = args[0], args[1:]
1874 def result(*rest, **kwrest):
1875 combined = kwargs.copy()
1876 combined.update(kwrest)
1877 return function(*args + rest, **combined)
1880 # Recipe: regex_from_encoded_pattern (1.0)
1881 def _regex_from_encoded_pattern(s):
1882 """'foo' -> re.compile(re.escape('foo'))
1883 '/foo/' -> re.compile('foo')
1884 '/foo/i' -> re.compile('foo', re.I)
1886 if s.startswith('/') and s.rfind('/') != 0:
1887 # Parse it: /PATTERN/FLAGS
1889 pattern, flags_str = s[1:idx], s[idx+1:]
1898 for char in flags_str:
1900 flags |= flag_from_char[char]
1902 raise ValueError("unsupported regex flag: '%s' in '%s' "
1903 "(must be one of '%s')"
1904 % (char, s, ''.join(flag_from_char.keys())))
1905 return re.compile(s[1:idx], flags)
1906 else: # not an encoded regex
1907 return re.compile(re.escape(s))
# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
    """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines

    "lines" is a list of lines to dedent.
    "tabsize" is the tab width to use for indent width calculations.
    "skip_first_line" is a boolean indicating if the first line should
    be skipped for calculating the indent width and for dedenting.
    This is sometimes useful for docstrings and similar.

    Same as dedent() except operates on a sequence of lines. Note: the
    lines list is modified **in-place**.
    """
    # NOTE(review): many lines appear missing from this copy (the DEBUG
    # guard, `margin`/`indent` initializations, the per-char indent scan,
    # and the trailing `return lines`); compare with upstream.
    print(f"dedent: dedent(..., tabsize={tabsize}, skip_first_line={skip_first_line})")
    for i, line in enumerate(lines):
        if i == 0 and skip_first_line: continue
        indent += tabsize - (indent % tabsize)
        continue  # skip all-whitespace lines
        continue  # skip all-whitespace lines
        if DEBUG: print(f"dedent: indent={indent}: {line}")
        margin = min(margin, indent)
    if DEBUG: print(f"dedent: margin={margin}")
    if margin is not None and margin > 0:
        for i, line in enumerate(lines):
            if i == 0 and skip_first_line: continue
            for j, ch in enumerate(line):
                removed += tabsize - (removed % tabsize)
                # NOTE(review): the next print is missing its f-prefix, so
                # "{line}" is emitted literally -- likely a bug.
                if DEBUG: print("dedent: {line}: EOL -> strip up to EOL")
                lines[i] = lines[i][j:]
                raise ValueError("unexpected non-whitespace char %r in "
                                 "line %r while removing %d-space margin"
                                 % (ch, line, margin))
                print(f"dedent: {line}: {ch} -> removed {removed}/{margin}")
                if removed == margin:
                    lines[i] = lines[i][j+1:]
                elif removed > margin:
                    lines[i] = ' '*(removed-margin) + lines[i][j+1:]
            lines[i] = lines[i][removed:]
def _dedent(text, tabsize=8, skip_first_line=False):
    """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text

    "text" is the text to dedent.
    "tabsize" is the tab width to use for indent width calculations.
    "skip_first_line" is a boolean indicating if the first line should
    be skipped for calculating the indent width and for dedenting.
    This is sometimes useful for docstrings and similar.

    Like textwrap.dedent(s), but don't expand tabs to spaces.
    """
    lines = text.splitlines(1)
    _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
    return ''.join(lines)
1994 class _memoized(object):
1995 """Decorator that caches a function's return value each time it is called.
1996 If called later with the same arguments, the cached value is returned, and
1999 http://wiki.python.org/moin/PythonDecoratorLibrary
2001 def __init__(self, func):
2004 def __call__(self, *args):
2006 return self.cache[args]
2008 self.cache[args] = value = self.func(*args)
2011 # uncachable -- for instance, passing a list as an argument.
2012 # Better to not cache than to blow up entirely.
2013 return self.func(*args)
2015 """Return the function's docstring."""
2016 return self.func.__doc__
def _xml_oneliner_re_from_tab_width(tab_width):
    """Standalone XML processing instruction regex."""
    # NOTE(review): several lines of this verbose-regex literal (grouping,
    # alternation bars, the indent prefix using the %d) appear to be
    # missing from this copy -- compare with upstream before editing.
    return re.compile(r"""
        (?<=\n\n)           # Starting after a blank line
        \A\n?               # the beginning of the doc
        <\?\w+\b\s+.*?\?>   # XML processing instruction
        <\w+:\w+\b\s+.*?/>  # namespaced single tag
        (?=\n{2,}|\Z)       # followed by a blank line or end of document
        """ % (tab_width - 1), re.X)
# Memoized per tab_width (patterns are invariant for a given width).
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
def _hr_tag_re_from_tab_width(tab_width):
    # Regex for a standalone <hr> tag on its own line.
    # NOTE(review): several lines of this verbose-regex literal appear to
    # be missing from this copy -- compare with upstream before editing.
    return re.compile(r"""
        (?<=\n\n)       # Starting after a blank line
        \A\n?           # the beginning of the doc
        <(hr)           # start tag = \2
        /?>             # the matching end tag
        (?=\n{2,}|\Z)   # followed by a blank line or end of document
        """ % (tab_width - 1), re.X)
# Memoized per tab_width (patterns are invariant for a given width).
_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)
2060 def _xml_escape_attr(attr, skip_single_quote=True):
2061 """Escape the given string for use in an HTML/XML tag attribute.
2063 By default this doesn't bother with escaping `'` to `'`, presuming that
2064 the tag attribute is surrounded by double quotes.
2067 .replace('&', '&')
2068 .replace('"', '"')
2069 .replace('<', '<')
2070 .replace('>', '>'))
2071 if not skip_single_quote:
2072 escaped = escaped.replace("'", "'")
2076 def _xml_encode_email_char_at_random(ch):
2078 # Roughly 10% raw, 45% hex, 45% dec.
2079 # '@' *must* be encoded. I [John Gruber] insist.
2080 # Issue 26: '_' must be encoded.
2081 if r > 0.9 and ch not in "@_":
2084 # The [1:] is to drop leading '0': 0x63 -> x63
2085 return '&#%s;' % hex(ord(ch))[1:]
2087 return '&#%s;' % ord(ch)
2093 class _NoReflowFormatter(optparse.IndentedHelpFormatter):
2094 """An optparse formatter that does NOT reflow the description."""
2095 def format_description(self, description):
2096 return description or ""
def main(argv=None):
    """Command-line entry point: convert markdown PATHS to HTML.

    NOTE(review): numerous lines appear to be missing from this copy (the
    `extras`/`link_patterns` initializations, try/except arms, the
    per-path loop header, the `if opts.compare:` guard, ...); compare
    with upstream python-markdown2 before editing.
    """
    if not logging.root.handlers:
        logging.basicConfig()

    usage = "usage: %prog [PATHS...]"
    version = "%prog "+__version__
    parser = optparse.OptionParser(prog="markdown2", usage=usage,
        version=version, description=cmdln_desc,
        formatter=_NoReflowFormatter())
    parser.add_option("-v", "--verbose", dest="log_level",
        action="store_const", const=logging.DEBUG,
        help="more verbose output")
    parser.add_option("--encoding",
        help="specify encoding of text content")
    parser.add_option("--html4tags", action="store_true", default=False,
        help="use HTML 4 style for empty element tags")
    parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",
        help="sanitize literal HTML: 'escape' escapes "
             "HTML meta chars, 'replace' replaces with an "
             "[HTML_REMOVED] note")
    parser.add_option("-x", "--extras", action="append",
        help="Turn on specific extra features (not part of "
             "the core Markdown spec). See above.")
    parser.add_option("--use-file-vars",
        help="Look for and use Emacs-style 'markdown-extras' "
             "file var to turn on extras. See "
             "<https://github.com/trentm/python-markdown2/wiki/Extras>")
    parser.add_option("--link-patterns-file",
        help="path to a link pattern file")
    parser.add_option("--self-test", action="store_true",
        help="run internal self-tests (some doctests)")
    parser.add_option("--compare", action="store_true",
        help="run against Markdown.pl as well (for testing)")
    parser.set_defaults(log_level=logging.INFO, compare=False,
        encoding="utf-8", safe_mode=None, use_file_vars=False)
    opts, paths = parser.parse_args()
    log.setLevel(opts.log_level)

    # Parse "-x" extras: comma/semicolon/space-separated NAME[=ARG] items.
    # NOTE(review): the `extras = {}` initialization and the `if '=' in e:`
    # branch around the two unpackings below appear to be missing.
    for s in opts.extras:
        splitter = re.compile("[,;: ]+")
        for e in splitter.split(s):
            ename, earg = e.split('=', 1)
            ename, earg = e, None
            extras[ename] = earg

    # Load link patterns, one "PATTERN HREF" pair per non-comment line.
    # NOTE(review): the `link_patterns = []` initialization, try/except
    # arms, and `finally: f.close()` appear to be missing here.
    if opts.link_patterns_file:
        f = open(opts.link_patterns_file)
        for i, line in enumerate(f.readlines()):
            if not line.strip(): continue
            if line.lstrip().startswith("#"): continue
            pat, href = line.rstrip().rsplit(None, 1)
            raise MarkdownError("%s:%d: invalid link pattern line: %r"
                % (opts.link_patterns_file, i+1, line))
            link_patterns.append(
                (_regex_from_encoded_pattern(pat), href))
        link_patterns = None
    from os.path import join, dirname, abspath, exists
    # NOTE(review): the final "Markdown.pl" join argument, the
    # `for path in paths:` loop header, and the `if opts.compare:` guard
    # appear to be missing around the block below.
    markdown_pl = join(dirname(dirname(abspath(__file__))), "test",
        print("==== Markdown.pl ====")
        perl_cmd = 'perl %s "%s"' % (markdown_pl, path)
        o = os.popen(perl_cmd)
        perl_html = o.read()
        sys.stdout.write(perl_html)
        print("==== markdown2.py ====")
    html = markdown_path(path, encoding=opts.encoding,
        html4tags=opts.html4tags,
        safe_mode=opts.safe_mode,
        extras=extras, link_patterns=link_patterns,
        use_file_vars=opts.use_file_vars)
        # NOTE(review): the opening `sys.stdout.write(` of this call
        # appears to be missing.
        html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
    if extras and "toc" in extras:
        log.debug("toc_html: " +
            html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
    test_dir = join(dirname(dirname(abspath(__file__))), "test")
    if exists(join(test_dir, "test_markdown2.py")):
        sys.path.insert(0, test_dir)
        from test_markdown2 import norm_html_from_html
        norm_html = norm_html_from_html(html)
        norm_perl_html = norm_html_from_html(perl_html)
        # NOTE(review): likely the else-arm of the `if exists(...)` above.
        norm_perl_html = perl_html
    did_match = (norm_perl_html == norm_html)
    print(f"==== match? {did_match} ====")
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit( main(sys.argv) )