1 # I took python-markdown2 and modified a few syntax elements to behave more
2 # like dokuwiki. The original python-markdown2 can be found at:
4 # https://github.com/trentm/python-markdown2
8 # Copyright (c) 2007-2008 ActiveState Corp.
9 # License: MIT (http://www.opensource.org/licenses/mit-license.php)
11 r"""A fast and complete Python implementation of Markdown.
13 [from http://daringfireball.net/projects/markdown/]
14 > Markdown is a text-to-HTML filter; it translates an easy-to-read /
15 > easy-to-write structured text format into HTML. Markdown's text
16 > format is most similar to that of plain text email, and supports
17 > features such as headers, *emphasis*, code blocks, blockquotes, and
20 > Markdown's syntax is designed not as a generic markup language, but
21 > specifically to serve as a front-end to (X)HTML. You can use span-level
22 > HTML tags anywhere in a Markdown document, and you can use block level
23 > HTML tags (like <div> and <table> as well).
28 >>> markdown2.markdown("*boo!*") # or use `html = markdown_path(PATH)`
29 u'<p><em>boo!</em></p>\n'
31 >>> markdowner = Markdown()
32 >>> markdowner.convert("*boo!*")
33 u'<p><em>boo!</em></p>\n'
34 >>> markdowner.convert("**boom!**")
35 u'<p><strong>boom!</strong></p>\n'
37 This implementation of Markdown implements the full "core" syntax plus a
38 number of extras (e.g., code syntax coloring, footnotes) as described on
39 <https://github.com/trentm/python-markdown2/wiki/Extras>.
42 cmdln_desc = """A fast and complete Python implementation of Markdown, a
43 text-to-HTML conversion tool for web writers.
45 Supported extras (see -x|--extras option below):
46 * code-friendly: Disable _ and __ for em and strong.
47 * code-color: Pygments-based syntax coloring of <code> sections.
48 * cuddled-lists: Allow lists to be cuddled to the preceding paragraph.
49 * footnotes: Support footnotes as in use on daringfireball.net and
50 implemented in other Markdown processors (tho not in Markdown.pl v1.0.1).
51 * header-ids: Adds "id" attributes to headers. The id value is a slug of
53 * html-classes: Takes a dict mapping html tag names (lowercase) to a
54 string to use for a "class" tag attribute. Currently only supports
55 "pre" and "code" tags. Add an issue if you require this for other tags.
56 * markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to
57 have markdown processing be done on its contents. Similar to
58 <http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
60 * pyshell: Treats unindented Python interactive shell sessions as <code>
62 * link-patterns: Auto-link given regex patterns in text (e.g. bug number
63 references, revision number references).
64 * smarty-pants: Replaces ' and " with curly quotation marks or curly
65 apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes,
67 * toc: The returned HTML string gets a new "toc_html" attribute which is
68 a Table of Contents for the document. (experimental)
69 * xml: Passes one-liner processing instructions and namespaced XML tags.
70 * wiki-tables: Google Code Wiki-style tables. See
71 <http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
75 # - There is already a Python markdown processor
76 # (http://www.freewisdom.org/projects/python-markdown/).
77 # - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm
# not yet sure if there are implications with this. Compare 'pydoc sre'
79 # and 'perldoc perlre'.
81 __version_info__ = (1, 0, 1, 19) # first three nums match Markdown.pl
82 __version__ = '1.0.1.19'
83 __author__ = "Trent Mick"
87 from pprint import pprint
91 from hashlib import md5
95 from random import random, randint
97 from urllib.parse import quote
#---- Python version compat

# NOTE(review): this shim appears truncated in this excerpt -- the `reversed`
# generator is missing its `yield i` body, and the second `_unicode_decode`
# definition looks like it belongs under a missing `else:` (Python >= 2.4
# path) rather than inside this branch.
if sys.version_info[:2] < (2,4):
    # Backport: `set` and `reversed` were added in Python 2.4.
    from sets import Set as set
    def reversed(sequence):
        for i in sequence[::-1]:
    # Decode bytes, replacing undecodable chars with XML char references.
    def _unicode_decode(s, encoding, errors='xmlcharrefreplace'):
        return unicode(s, encoding, errors)
    # Presumably the modern-Python path (strict decoding) -- TODO confirm
    # it was originally under an `else:`.
    def _unicode_decode(s, encoding, errors='strict'):
        return s.decode(encoding, errors)
# Module-level logger.
log = logging.getLogger("markdown")

DEFAULT_TAB_WIDTH = 4

# Salt mixed into placeholder hashes so they differ per process/run.
# NOTE(review): two competing assignments survive here -- originally these
# likely sat on alternative branches; as written the uuid4-based value
# simply overwrites the randint-based one.
SECRET_SALT = str(randint(0, 1000000)).encode('utf-8')
SECRET_SALT = str(uuid.uuid4()).encode('utf-8')
# NOTE(review): the two `return` lines below look like the truncated bodies
# of the module's text-hashing helpers (`_hash_ascii` / `_hash_text`) whose
# `def` lines are elided from this excerpt.
#return md5(s).hexdigest() # Markdown.pl effectively does this.
return 'md5-' + md5(SECRET_SALT + s.encode("utf-8")).hexdigest()
return 'md5-' + md5(SECRET_SALT + s.encode("utf-8")).hexdigest()

# Table of hash values for escaped characters:
g_escape_table = dict([(ch, _hash_ascii(ch))
                       for ch in '\\`*_{}[]()>#+-.!'])
class MarkdownError(Exception):
    """Error raised by this module (e.g. for an invalid `safe_mode` value).

    BUG FIX: the class previously had no body, which is a syntax error.
    """
    pass
def markdown_path(path, encoding="utf-8",
                  html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
                  safe_mode=None, extras=None, link_patterns=None,
                  use_file_vars=False):
    """Convert the markdown file at `path` to HTML and return the result.

    The file is read with the given `encoding`; all other options are
    passed straight through to `Markdown`.
    """
    # BUG FIX: the file was opened but never read or closed, and `text`
    # was referenced while undefined.
    fp = codecs.open(path, 'r', encoding)
    try:
        text = fp.read()
    finally:
        fp.close()
    return Markdown(html4tags=html4tags, tab_width=tab_width,
                    safe_mode=safe_mode, extras=extras,
                    link_patterns=link_patterns,
                    use_file_vars=use_file_vars).convert(text)
def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
             safe_mode=None, extras=None, link_patterns=None,
             use_file_vars=False):
    """Convert markdown `text` to HTML with a one-shot `Markdown` instance."""
    converter = Markdown(html4tags=html4tags,
                         tab_width=tab_width,
                         safe_mode=safe_mode,
                         extras=extras,
                         link_patterns=link_patterns,
                         use_file_vars=use_file_vars)
    return converter.convert(text)
class Markdown(object):
    """The Markdown-to-HTML converter: construct one (optionally with
    options/extras) and call `convert(text)`."""
    # The dict of "extras" to enable in processing -- a mapping of
    # extra name to argument for the extra. Most extras do not have an
    # argument, in which case the value is None.
    #
    # This can be set via (a) subclassing and (b) the constructor
    # "extras" argument.

    # Placeholder emitted for removed HTML in safe_mode == "replace".
    html_removed_text = "[HTML_REMOVED]" # for compat with markdown.py

    # Used to track when we're inside an ordered or unordered list
    # (see _ProcessListItems() for details):
    # NOTE(review): the attribute this comment described (a list-nesting
    # counter) is not present in this excerpt; the regex below matches
    # lines consisting only of spaces/tabs.
    _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)
    def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
                 extras=None, link_patterns=None, use_file_vars=False):
        # Choose how empty elements are closed: ">" for HTML4, " />" for
        # XHTML.
        # NOTE(review): the `if html4tags: ... else: ...` lines appear to be
        # elided from this excerpt; as written the second assignment wins.
        self.empty_element_suffix = ">"
        self.empty_element_suffix = " />"
        self.tab_width = tab_width

        # For compatibility with earlier markdown2.py and with
        # markdown.py's safe_mode being a boolean,
        #   safe_mode == True -> "replace"
        if safe_mode is True:
            self.safe_mode = "replace"
        # NOTE(review): `else:` elided -- originally the raw value was kept.
        self.safe_mode = safe_mode

        # Massaging and building the "extras" info.
        if self.extras is None:
            # NOTE(review): body elided (presumably `self.extras = {}`).
        elif not isinstance(self.extras, dict):
            self.extras = dict([(e, None) for e in self.extras])
        # NOTE(review): an enclosing `if extras:` guard appears elided here.
        if not isinstance(extras, dict):
            extras = dict([(e, None) for e in extras])
        self.extras.update(extras)
        assert isinstance(self.extras, dict)
        if "toc" in self.extras and not "header-ids" in self.extras:
            self.extras["header-ids"] = None # "toc" implies "header-ids"
        self._instance_extras = self.extras.copy()

        self.link_patterns = link_patterns
        self.use_file_vars = use_file_vars
        # Matches one level of leading indentation (a tab or up to
        # `tab_width` spaces) for outdenting.
        self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)

        self._escape_table = g_escape_table.copy()
        if "smarty-pants" in self.extras:
            # Quotes must round-trip untouched for smarty-pants to work.
            self._escape_table['"'] = _hash_ascii('"')
            self._escape_table["'"] = _hash_ascii("'")

    # NOTE(review): a `def reset(self):` line appears to be elided here;
    # the statements below reinitialize per-conversion state.
        self.html_blocks = {}
        self.extras = self._instance_extras.copy()
        if "footnotes" in self.extras:
            # NOTE(review): elided line (presumably `self.footnotes = {}`).
            self.footnote_ids = []
        if "header-ids" in self.extras:
            self._count_from_header_id = {} # no `defaultdict` in Python 2.4
    def convert(self, text):
        """Convert the given text."""
        # Main function. The order in which other subs are called here is
        # essential. Link and image substitutions need to happen before
        # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
        # and <img> tags get encoded.

        # Clear the global hashes. If we don't clear these, you get conflicts
        # from other articles when generating a page which contains more than
        # one article (e.g. an index page that shows the N most recent
        # articles):
        # NOTE(review): the reset call these comments describe is elided
        # from this excerpt, as are several other lines marked below.

        if not isinstance(text, str):
            #TODO: perhaps shouldn't presume UTF-8 for string input?
            text = text.decode('utf-8')

        if self.use_file_vars:
            # Look for emacs-style file variable hints.
            emacs_vars = self._get_emacs_vars(text)
            if "markdown-extras" in emacs_vars:
                splitter = re.compile("[ ,]+")
                for e in splitter.split(emacs_vars["markdown-extras"]):
                    # NOTE(review): the `if '=' in e:` / `else:` guards around
                    # the next two (mutually exclusive) lines appear elided.
                    ename, earg = e.split('=', 1)
                    ename, earg = e, None
                    self.extras[ename] = earg

        # Standardize line endings:
        text = re.sub("\r\n|\r", "\n", text)

        # Make sure $text ends with a couple of newlines:
        # NOTE(review): the `text += "\n\n"` this refers to is elided.

        # Convert all tabs to spaces.
        text = self._detab(text)

        # Strip any lines consisting only of spaces and tabs.
        # This makes subsequent regexen easier to write, because we can
        # match consecutive blank lines with /\n+/ instead of something
        # contorted like /[ \t]*\n+/ .
        text = self._ws_only_line_re.sub("", text)

        # NOTE(review): presumably guarded by `if self.safe_mode:` -- confirm.
        text = self._hash_html_spans(text)

        # Turn block-level HTML blocks into hash entries
        text = self._hash_html_blocks(text, raw=True)

        # Strip link definitions, store in hashes.
        if "footnotes" in self.extras:
            # Must do footnotes first because an unlucky footnote defn
            # looks like a link defn:
            #   [^4]: this "looks like a link defn"
            text = self._strip_footnote_definitions(text)
        text = self._strip_link_definitions(text)

        text = self._run_block_gamut(text)

        if "footnotes" in self.extras:
            text = self._add_footnotes(text)

        # Subclass hook (see `postprocess`).
        text = self.postprocess(text)

        text = self._unescape_special_chars(text)

        # NOTE(review): presumably guarded by `if self.safe_mode:` -- confirm.
        text = self._unhash_html_spans(text)

        # Wrap the result so extras (e.g. "toc") can attach attributes.
        rv = UnicodeWithAttrs(text)
        if "toc" in self.extras:
            # NOTE(review): body elided (presumably attaches `rv.toc_html`);
            # the final `return rv` is also absent from this excerpt.
324 def postprocess(self, text):
325 """A hook for subclasses to do some postprocessing of the html, if
326 desired. This is called before unescaping of special chars and
327 unhashing of raw HTML spans.
    # Matches a '-*- var: val; ... -*-' one-liner; group 1 is the inner text.
    _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
    # This regular expression is intended to match blocks like this:
    #   PREFIX Local Variables: SUFFIX
    #   PREFIX mode: Tcl SUFFIX
    #   PREFIX End: SUFFIX
    # Some notes:
    # - "[ \t]" is used instead of "\s" to specifically exclude newlines
    # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
    #   not like anything other than Unix-style line terminators.
    _emacs_local_vars_pat = re.compile(r"""^
        (?P<prefix>(?:[^\r\n|\n|\r])*?)
        [\ \t]*Local\ Variables:[\ \t]*
        (?P<suffix>.*?)(?:\r\n|\n|\r)
        (?P<content>.*?\1End:)
        """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
    def _get_emacs_vars(self, text):
        """Return a dictionary of emacs-style local variables.

        Parsing is done loosely according to this spec (and according to
        some in-practice deviations from this):
        http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
        """
        # NOTE(review): this excerpt elides several lines (the `emacs_vars`
        # dict initializer, the `head`/`tail` slicing, `if match:` guards,
        # try/except wrappers, `continue`s and the final return); the
        # comments below annotate the visible logic only.
        SIZE = pow(2, 13) # 8kB

        # Search near the start for a '-*-'-style one-liner of variables.
        match = self._emacs_oneliner_vars_pat.search(head)
        emacs_vars_str = match.group(1)
        assert '\n' not in emacs_vars_str
        # Split the one-liner into individual "var: value" strings.
        emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
        if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
            # While not in the spec, this form is allowed by emacs:
            #   -*- Tcl -*-
            # where the implied "variable" is "mode". This form
            # is only allowed if there are no other variables.
            emacs_vars["mode"] = emacs_var_strs[0].strip()
        for emacs_var_str in emacs_var_strs:
            # NOTE(review): originally a try/except around the split, with
            # the log.debug in the failure branch.
            variable, value = emacs_var_str.strip().split(':', 1)
            log.debug("emacs variables error: malformed -*- "
                      "line: %r", emacs_var_str)
            # Lowercase the variable name because Emacs allows "Mode"
            # or "mode" or "MoDe", etc.
            emacs_vars[variable.lower()] = value.strip()

        # Search in the tail for a "Local Variables" block.
        if "Local Variables" in tail:
            match = self._emacs_local_vars_pat.search(tail)
            prefix = match.group("prefix")
            suffix = match.group("suffix")
            lines = match.group("content").splitlines(0)
            #print "prefix=%r, suffix=%r, content=%r, lines: %s"\
            #      % (prefix, suffix, match.group("content"), lines)

            # Validate the Local Variables block: proper prefix and suffix
            # usage on every line.
            for i, line in enumerate(lines):
                if not line.startswith(prefix):
                    log.debug("emacs variables error: line '%s' "
                              "does not use proper prefix '%s'"
                # Don't validate suffix on last line. Emacs doesn't care,
                # so neither do we.
                if i != len(lines)-1 and not line.endswith(suffix):
                    log.debug("emacs variables error: line '%s' "
                              "does not use proper suffix '%s'"

            # Parse out one emacs var per line.
            for line in lines[:-1]: # no var on the last line ("PREFIX End:")
                if prefix: line = line[len(prefix):] # strip prefix
                if suffix: line = line[:-len(suffix)] # strip suffix
                # Continuation lines: keep appending to the last variable.
                variable = continued_for
                if line.endswith('\\'):
                    line = line[:-1].rstrip()
                emacs_vars[variable] += ' ' + line
                # NOTE(review): originally inside try/except, with the
                # log.debug below on the failure path.
                variable, value = line.split(':', 1)
                log.debug("local variables error: missing colon "
                          "in local variables entry: '%s'" % line)
                # Do NOT lowercase the variable name, because Emacs only
                # allows "mode" (and not "Mode", "MoDe", etc.) in this block.
                value = value.strip()
                if value.endswith('\\'):
                    # Trailing backslash marks a continuation onto the
                    # next line.
                    value = value[:-1].rstrip()
                    continued_for = variable
                emacs_vars[variable] = value

        # Unquote values: strip one pair of surrounding double quotes.
        for var, val in emacs_vars.items():
            if len(val) > 1 and (val.startswith('"') and val.endswith('"')
               or val.startswith('"') and val.endswith('"')):
                # NOTE(review): both alternatives above test double quotes;
                # one presumably targeted single quotes -- confirm upstream.
                emacs_vars[var] = val[1:-1]
448 # Cribbed from a post by Bart Lateur:
449 # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
450 _detab_re = re.compile(r'(.*?)\t', re.M)
451 def _detab_sub(self, match):
453 return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width))
    def _detab(self, text):
        r"""Remove (leading?) tabs from a file.

            >>> m = Markdown()
            >>> m._detab("\tfoo")
            '    foo'
            >>> m._detab(" \tfoo")
            '    foo'
            >>> m._detab("\t foo")
            '     foo'
            >>> m._detab("  foo\n\tbar\tblam")
            '  foo\n    bar blam'
        """
        return self._detab_re.subn(self._detab_sub, text)[0]
    # I broke out the html5 tags here and add them to _block_tags_a and
    # _block_tags_b. This way html5 tags are easy to keep track of.
    _html5tags = '|article|aside|header|hgroup|footer|nav|section|figure|figcaption'

    _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
    _block_tags_a += _html5tags

    # NOTE(review): both compiled patterns below appear truncated in this
    # excerpt -- the attribute-matching line(s) and the closing
    # `""" % _block_tags_x, re.X | re.M)` are elided.
    _strict_tag_block_re = re.compile(r"""
        ^ # start of line (with re.M)
        <(%s) # start tag = \2
        (.*\n)*? # any number of lines, minimally matching
        </\2> # the matching end tag
        [ \t]* # trailing spaces/tabs
        (?=\n+|\Z) # followed by a newline or end of document

    # The liberal variant also accepts the end tag mid-line (`.*</\2>`).
    _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
    _block_tags_b += _html5tags

    _liberal_tag_block_re = re.compile(r"""
        ^ # start of line (with re.M)
        <(%s) # start tag = \2
        (.*\n)*? # any number of lines, minimally matching
        .*</\2> # the matching end tag
        [ \t]* # trailing spaces/tabs
        (?=\n+|\Z) # followed by a newline or end of document

    # Matches a `markdown="1"` / `markdown='1'` attribute on a block tag
    # (used by the "markdown-in-html" extra).
    _html_markdown_attr_re = re.compile(
        r'''\s+markdown=("1"|'1')''')
    def _hash_html_block_sub(self, match, raw=False):
        # Replace a matched HTML block with a hash-key placeholder,
        # remembering the original text in `self.html_blocks`. `raw`
        # indicates the HTML came from the original source (matters in
        # safe_mode).
        html = match.group(1)
        if raw and self.safe_mode:
            html = self._sanitize_html(html)
        elif 'markdown-in-html' in self.extras and 'markdown=' in html:
            first_line = html.split('\n', 1)[0]
            m = self._html_markdown_attr_re.search(first_line)
            # NOTE(review): an `if m:` guard appears to be elided here.
            # Hash the wrapper tag's first/last lines separately so the
            # middle can still be markdown-processed.
            lines = html.split('\n')
            middle = '\n'.join(lines[1:-1])
            last_line = lines[-1]
            # Strip the markdown="1" attribute from the emitted tag.
            first_line = first_line[:m.start()] + first_line[m.end():]
            f_key = _hash_text(first_line)
            self.html_blocks[f_key] = first_line
            l_key = _hash_text(last_line)
            self.html_blocks[l_key] = last_line
            # NOTE(review): the closing `l_key, "\n\n"])` of this return
            # appears to be elided.
            return ''.join(["\n\n", f_key,
                            "\n\n", middle, "\n\n",
        key = _hash_text(html)
        self.html_blocks[key] = html
        return "\n\n" + key + "\n\n"
    def _hash_html_blocks(self, text, raw=False):
        """Hashify HTML blocks

        We only want to do this for block-level HTML tags, such as headers,
        lists, and tables. That's because we still want to wrap <p>s around
        "paragraphs" that are wrapped in non-block-level tags, such as anchors,
        phrase emphasis, and spans. The list of tags we're looking for is
        hard-coded.

        @param raw {boolean} indicates if these are raw HTML blocks in
            the original source. It makes a difference in "safe" mode.
        """
        # NOTE(review): this excerpt elides several lines (an early-out for
        # text without '<', the `while True:`/`try:` structure of the
        # comment-scanning loop, `break`/`continue` statements, the `start`
        # bookkeeping and the final `return text`).
        # Pass `raw` value into our calls to self._hash_html_block_sub.
        hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)

        # First, look for nested blocks, e.g.:
        #   <div>
        #       <div>
        #       tags for inner block must be indented.
        #       </div>
        #   </div>
        #
        # The outermost tags must start at the left margin for this to match, and
        # the inner nested divs must be indented.
        # We need to do this before the next, more liberal match, because the next
        # match will start at the first `<div>` and stop at the first `</div>`.
        text = self._strict_tag_block_re.sub(hash_html_block_sub, text)

        # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
        text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)

        # Special case just for <hr />. It was easier to make a special
        # case than to make the other regex more complicated.
        _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
        text = _hr_tag_re.sub(hash_html_block_sub, text)

        # Special case for standalone HTML comments:

        # Delimiters for next comment block.
        start_idx = text.index("<!--", start)
        except ValueError as ex:
        end_idx = text.index("-->", start_idx) + 3
        except ValueError as ex:

        # Start position for next comment block search.

        # Validate whitespace before comment.
        # - Up to `tab_width - 1` spaces before start_idx.
        for i in range(self.tab_width - 1):
            if text[start_idx - 1] != ' ':
        # - Must be preceded by 2 newlines or hit the start of
        #   the document.
        elif start_idx == 1 and text[0] == '\n':
            start_idx = 0 # to match minute detail of Markdown.pl regex
        elif text[start_idx-2:start_idx] == '\n\n':

        # Validate whitespace after comment.
        # - Any number of spaces and tabs.
        while end_idx < len(text):
            if text[end_idx] not in ' \t':
        # - Must be following by 2 newlines or hit end of text.
        if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):

        # Escape and hash (must match `_hash_html_block_sub`).
        html = text[start_idx:end_idx]
        if raw and self.safe_mode:
            html = self._sanitize_html(html)
        key = _hash_text(html)
        self.html_blocks[key] = html
        text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]

        if "xml" in self.extras:
            # Treat XML processing instructions and namespaced one-liner
            # tags as if they were block HTML tags. E.g., if standalone
            # (i.e. are their own paragraph), the following do not get
            # wrapped in a <p> tag:
            #    <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>
            _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
            text = _xml_oneliner_re.sub(hash_html_block_sub, text)
    def _strip_link_definitions(self, text):
        # Strips link definitions from text, stores the URLs and titles in
        # hash references (self.urls / self.titles via the sub callback).
        less_than_tab = self.tab_width - 1

        # Link defs are in the form:
        #   [id]: url "optional title"
        # NOTE(review): several pattern lines (notably the URL group = \2
        # and the closing of the pattern) appear elided from this excerpt.
        _link_def_re = re.compile(r"""
            ^[ ]{0,%d}\[(.+)\]: # id = \1
            \n? # maybe *one* newline
            \n? # maybe one newline
            (?<=\s) # lookbehind for whitespace
            ([^\n]*) # title = \3
            )? # title is optional
            """ % less_than_tab, re.X | re.M | re.U)
        return _link_def_re.sub(self._extract_link_def_sub, text)
669 def _extract_link_def_sub(self, match):
670 id, url, title = match.groups()
671 key = id.lower() # Link IDs are case-insensitive
672 self.urls[key] = self._encode_amps_and_angles(url)
674 self.titles[key] = title
677 def _extract_footnote_def_sub(self, match):
678 id, text = match.groups()
679 text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
680 normed_id = re.sub(r'\W', '-', id)
681 # Ensure footnote text ends with a couple newlines (for some
682 # block gamut matches).
683 self.footnotes[normed_id] = text + "\n\n"
    def _strip_footnote_definitions(self, text):
        """A footnote definition looks like this:

            [^note-id]: Text of the note.

                May include one or more indented paragraphs.

        Where,
        - The 'note-id' can be pretty much anything, though typically it
          is the number of the footnote.
        - The first paragraph may start on the next line, like so:

            [^note-id]:
        """
        less_than_tab = self.tab_width - 1
        # NOTE(review): several pattern lines and the flags argument
        # (presumably re.X | re.M) appear elided from this excerpt.
        footnote_def_re = re.compile(r'''
            ^[ ]{0,%d}\[\^(.+)\]: # id = \1
            ( # footnote text = \2
              # First line need not start with the spaces.
              (?:[ ]{%d} | \t) # Subsequent lines must be indented.
              # Lookahead for non-space at line-start, or end of doc.
              (?:(?=^[ ]{0,%d}\S)|\Z)
            ''' % (less_than_tab, self.tab_width, self.tab_width),
        return footnote_def_re.sub(self._extract_footnote_def_sub, text)

    # Horizontal-rule data: (char, regex) pairs for '*', '-' and '_' rules.
    # NOTE(review): the `_hr_data = [` opening and the closing `]` appear
    # to be elided from this excerpt.
        ('*', re.compile(r"^[ ]{0,3}\*(.*?)$", re.M)),
        ('-', re.compile(r"^[ ]{0,3}\-(.*?)$", re.M)),
        ('_', re.compile(r"^[ ]{0,3}\_(.*?)$", re.M)),
726 def _run_block_gamut(self, text):
727 # These are all the transformations that form block-level
728 # tags like paragraphs, headers, and list items.
730 text = self._do_headers(text)
732 # Do Horizontal Rules:
733 # On the number of spaces in horizontal rules: The spec is fuzzy: "If
734 # you wish, you may use spaces between the hyphens or asterisks."
735 # Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
736 # hr chars to one or two. We'll reproduce that limit here.
737 hr = "\n<hr"+self.empty_element_suffix+"\n"
738 for ch, regex in self._hr_data:
740 for m in reversed(list(regex.finditer(text))):
741 tail = m.group(1).rstrip()
742 if not tail.strip(ch + ' ') and tail.count(" ") == 0:
743 start, end = m.span()
744 text = text[:start] + hr + text[end:]
746 text = self._do_lists(text)
748 if "pyshell" in self.extras:
749 text = self._prepare_pyshell_blocks(text)
750 if "wiki-tables" in self.extras:
751 text = self._do_wiki_tables(text)
753 text = self._do_code_blocks(text)
755 text = self._do_block_quotes(text)
757 # We already ran _HashHTMLBlocks() before, in Markdown(), but that
758 # was to escape raw HTML in the original Markdown source. This time,
759 # we're escaping the markup we've just created, so that we don't wrap
760 # <p> tags around block-level tags.
761 text = self._hash_html_blocks(text)
763 text = self._form_paragraphs(text)
    def _pyshell_block_sub(self, match):
        # Indent a matched interactive-shell session by one tab-width so
        # the code-block pass renders it as <pre><code>.
        # NOTE(review): this excerpt appears truncated -- originally the
        # lines were dedented first, and `s` was closed (`+ '\n\n')`) and
        # returned.
        lines = match.group(0).splitlines(0)
        indent = ' ' * self.tab_width
        s = ('\n' # separate from possible cuddled paragraph
             + indent + ('\n'+indent).join(lines)
776 def _prepare_pyshell_blocks(self, text):
777 """Ensure that Python interactive shell sessions are put in
778 code blocks -- even if not properly indented.
780 if ">>>" not in text:
783 less_than_tab = self.tab_width - 1
784 _pyshell_block_re = re.compile(r"""
785 ^([ ]{0,%d})>>>[ ].*\n # first line
786 ^(\1.*\S+.*\n)* # any number of subsequent lines
787 ^\n # ends with a blank line
788 """ % less_than_tab, re.M | re.X)
790 return _pyshell_block_re.sub(self._pyshell_block_sub, text)
    def _wiki_table_sub(self, match):
        # Render one matched block of `|| cell ||` wiki-table lines as an
        # HTML <table>.
        # NOTE(review): several lines are elided in this excerpt (collecting
        # the parsed `rows`, opening each <tr>/<td>, iterating cells).
        ttext = match.group(0).strip()
        #print 'wiki table: %r' % match.group(0)
        for line in ttext.splitlines(0):
            line = line.strip()[2:-2].strip() # drop the outer '||' pair
            # Split on '||' separators, honoring backslash-escaped pipes.
            row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
        hlines = ['<table>', '<tbody>']
        hrow.append(self._run_span_gamut(cell)) # span-process each cell
        hlines.append(''.join(hrow))
        hlines += ['</tbody>', '</table>']
        return '\n'.join(hlines) + '\n'
813 def _do_wiki_tables(self, text):
818 less_than_tab = self.tab_width - 1
819 wiki_table_re = re.compile(r'''
820 (?:(?<=\n\n)|\A\n?) # leading blank line
821 ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n # first line
822 (^\1\|\|.+?\|\|\n)* # any number of subsequent lines
823 ''' % less_than_tab, re.M | re.X)
824 return wiki_table_re.sub(self._wiki_table_sub, text)
826 def _run_span_gamut(self, text):
827 # These are all the transformations that occur *within* block-level
828 # tags like paragraphs, headers, and list items.
830 text = self._do_code_spans(text)
832 text = self._escape_special_chars(text)
834 # Process anchor and image tags.
835 text = self._do_links(text)
837 # Make links out of things like `<http://example.com/>`
838 # Must come after _do_links(), because you can use < and >
839 # delimiters in inline links like [this](<url>).
840 text = self._do_auto_links(text)
842 if "link-patterns" in self.extras:
843 text = self._do_link_patterns(text)
845 text = self._encode_amps_and_angles(text)
847 text = self._do_italics_bold_underline_mono(text)
849 if "smarty-pants" in self.extras:
850 text = self._do_smart_punctuation(text)
853 text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
    # "Sorta" because auto-links are identified as "tag" tokens.
    # NOTE(review): most of this tokenizer pattern (the tag alternative,
    # HTML-comment alternative, closing `"""` and flags) appears elided
    # from this excerpt.
    _sorta_html_tokenize_re = re.compile(r"""
        (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))* # attributes
        # auto-link (e.g., <http://www.activestate.com/>)
        <\?.*?\?> # processing instruction
    def _escape_special_chars(self, text):
        # Python markdown note: the HTML tokenization here differs from
        # that in Markdown.pl, hence the behaviour for subtle cases can
        # differ (I believe the tokenizer here does a better job because
        # it isn't susceptible to unmatched '<' and '>' in HTML tags).
        # Note, however, that '>' is not allowed in an auto-link URL
        # here.
        # NOTE(review): the `escaped = []` initializer and the
        # `if is_html_markup:` / `else:` branch lines appear elided; the
        # two append calls below belong to those two branches.
        is_html_markup = False
        for token in self._sorta_html_tokenize_re.split(text):
                # Within tags/HTML-comments/auto-links, encode * and _
                # so they don't conflict with their use in Markdown for
                # italics and strong. We're replacing each such
                # character with its corresponding MD5 checksum value;
                # this is likely overkill, but it should prevent us from
                # colliding with the escape values by accident.
                escaped.append(token.replace('*', self._escape_table['*'])
                                    .replace('_', self._escape_table['_']))
                escaped.append(self._encode_backslash_escapes(token))
            # The split alternates non-markup / markup tokens.
            is_html_markup = not is_html_markup
        return ''.join(escaped)
    def _hash_html_spans(self, text):
        # Used for safe_mode.
        # NOTE(review): several lines appear elided (the `return True`
        # bodies of `_is_auto_link`, the `tokens = []` initializer and the
        # appends of hashed/plain tokens).

        def _is_auto_link(s):
            # Auto-links must survive safe_mode untouched.
            if ':' in s and self._auto_link_re.match(s):
            elif '@' in s and self._auto_email_link_re.match(s):

        is_html_markup = False
        for token in self._sorta_html_tokenize_re.split(text):
            if is_html_markup and not _is_auto_link(token):
                # Replace raw HTML spans with a hash key; the sanitized
                # form is restored later by _unhash_html_spans().
                sanitized = self._sanitize_html(token)
                key = _hash_text(sanitized)
                self.html_spans[key] = sanitized
            # The split alternates non-markup / markup tokens.
            is_html_markup = not is_html_markup
        return ''.join(tokens)
922 def _unhash_html_spans(self, text):
923 for key, sanitized in self.html_spans.items():
924 text = text.replace(key, sanitized)
    def _sanitize_html(self, s):
        # safe_mode handling for a raw HTML span/block:
        #   "replace" -> drop it entirely; "escape" -> entity-escape it.
        if self.safe_mode == "replace":
            return self.html_removed_text
        elif self.safe_mode == "escape":
            # NOTE(review): the `replacements` list of (before, after)
            # entity pairs and the `return s` appear elided here.
            for before, after in replacements:
                s = s.replace(before, after)
        # NOTE(review): presumably under an elided `else:` branch.
        raise MarkdownError("invalid value for 'safe_mode': %r (must be "
                            "'escape' or 'replace')" % self.safe_mode)
    # NOTE(review): both patterns below appear heavily truncated in this
    # excerpt (missing the paren/url/id groups, closing quotes and flags).
    _tail_of_inline_link_re = re.compile(r'''
        # Match tail of: [text](/url/) or [text](/url/ "title")
        (['"]) # quote char = \3
        )? # title is optional

    _tail_of_reference_link_re = re.compile(r'''
        # Match tail of: [text][id]
        [ ]? # one optional space
        (?:\n[ ]*)? # one optional newline followed by spaces
    def _do_links(self, text):
        """Turn Markdown link shortcuts into XHTML <a> and <img> tags.

        This is a combination of Markdown.pl's _DoAnchors() and
        _DoImages(). They are done together because that simplified the
        approach. It was necessary to use a different approach than
        Markdown.pl because of the lack of atomic matching support in
        Python's regex engine used in $g_nested_brackets.
        """
        # NOTE(review): this excerpt elides a number of lines (the
        # `curr_pos = 0` initializer, try/except around `.index()`, the
        # bracket-depth bookkeeping, several `if`/`else:`/`continue`/
        # `return` lines); comments below annotate the visible logic only.
        MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24

        # `anchor_allowed_pos` is used to support img links inside
        # anchors, but not anchors inside anchors. An anchor's start
        # pos must be `>= anchor_allowed_pos`.
        anchor_allowed_pos = 0

        while True: # Handle the next link.
            # The next '[' is the start of:
            # - an inline anchor:   [text](url "title")
            # - a reference anchor: [text][id]
            # - an inline img:      ![text](url "title")
            # - a reference img:    ![text][id]
            # - a footnote ref:     [^id]
            #   (Only if 'footnotes' extra enabled)
            # - a footnote defn:    [^id]: ...
            #   (Only if 'footnotes' extra enabled) These have already
            #   been stripped in _strip_footnote_definitions() so no
            #   need to watch for them.
            # - a link definition:  [id]: url "title"
            #   These have already been stripped in
            #   _strip_link_definitions() so no need to watch for them.
            # - not markup:         [...anything else...
            start_idx = text.index('[', curr_pos)
            text_length = len(text)

            # Find the matching closing ']'.
            # Markdown.pl allows *matching* brackets in link text so we
            # will here too. Markdown.pl *doesn't* currently allow
            # matching brackets in img alt text -- we'll differ in that
            # regard.
            for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
                                            text_length)):
                if bracket_depth < 0:
                # Closing bracket not found within sentinel length.
                # This isn't markup.
                curr_pos = start_idx + 1
            link_text = text[start_idx+1:p]

            # Possibly a footnote ref?
            if "footnotes" in self.extras and link_text.startswith("^"):
                normed_id = re.sub(r'\W', '-', link_text[1:])
                if normed_id in self.footnotes:
                    self.footnote_ids.append(normed_id)
                    result = '<sup class="footnote-ref" id="fnref-%s">' \
                             '<a href="#fn-%s">%s</a></sup>' \
                             % (normed_id, normed_id, len(self.footnote_ids))
                    text = text[:start_idx] + result + text[p+1:]
                # This id isn't defined, leave the markup alone.

            # Now determine what this is by the remainder.
            if p == text_length:

            # Inline anchor or img?
            if text[p] == '(': # attempt at perf improvement
                match = self._tail_of_inline_link_re.match(text, p)
                # Handle an inline anchor or img.
                # A preceding '!' marks an image rather than an anchor.
                is_img = start_idx > 0 and text[start_idx-1] == "!"

                url, title = match.group("url"), match.group("title")
                if url and url[0] == '<':
                    url = url[1:-1] # '<url>' -> 'url'
                # We've got to encode these to avoid conflicting
                # with italics/bold.
                url = url.replace('*', self._escape_table['*']) \
                         .replace('_', self._escape_table['_'])
                title_str = ' title="%s"' % (
                    _xml_escape_attr(title)
                        .replace('*', self._escape_table['*'])
                        .replace('_', self._escape_table['_']))

                # NOTE(review): originally under an elided `if is_img:`;
                # the '"' -> '"' replace below is a no-op -- presumably
                # an entity-decoding artifact of '&quot;' -- confirm
                # against upstream.
                result = '<img src="%s" alt="%s"%s%s' \
                    % (url.replace('"', '"'),
                       _xml_escape_attr(link_text),
                       title_str, self.empty_element_suffix)
                curr_pos = start_idx + len(result)
                text = text[:start_idx] + result + text[match.end():]
                elif start_idx >= anchor_allowed_pos:
                    result_head = '<a href="%s"%s>' % (url, title_str)
                    result = '%s%s</a>' % (result_head, link_text)
                    # <img> allowed from curr_pos on, <a> from
                    # anchor_allowed_pos on.
                    curr_pos = start_idx + len(result_head)
                    anchor_allowed_pos = start_idx + len(result)
                    text = text[:start_idx] + result + text[match.end():]
                # Anchor not allowed here.
                curr_pos = start_idx + 1

            # Reference anchor or img?
                match = self._tail_of_reference_link_re.match(text, p)
                # Handle a reference-style anchor or img.
                is_img = start_idx > 0 and text[start_idx-1] == "!"

                link_id = match.group("id").lower()
                link_id = link_text.lower() # for links like [this][]
                if link_id in self.urls:
                    url = self.urls[link_id]
                    # We've got to encode these to avoid conflicting
                    # with italics/bold.
                    url = url.replace('*', self._escape_table['*']) \
                             .replace('_', self._escape_table['_'])
                    title = self.titles.get(link_id)
                    title = _xml_escape_attr(title) \
                        .replace('*', self._escape_table['*']) \
                        .replace('_', self._escape_table['_'])
                    title_str = ' title="%s"' % title

                    # NOTE(review): same apparent '&quot;' artifact as above.
                    result = '<img src="%s" alt="%s"%s%s' \
                        % (url.replace('"', '"'),
                           link_text.replace('"', '"'),
                           title_str, self.empty_element_suffix)
                    curr_pos = start_idx + len(result)
                    text = text[:start_idx] + result + text[match.end():]
                    elif start_idx >= anchor_allowed_pos:
                        # NOTE(review): the first `result` assignment below is
                        # dead -- it is immediately rebuilt two lines later.
                        result = '<a href="%s"%s>%s</a>' \
                            % (url, title_str, link_text)
                        result_head = '<a href="%s"%s>' % (url, title_str)
                        result = '%s%s</a>' % (result_head, link_text)
                        # <img> allowed from curr_pos on, <a> from
                        # anchor_allowed_pos on.
                        curr_pos = start_idx + len(result_head)
                        anchor_allowed_pos = start_idx + len(result)
                        text = text[:start_idx] + result + text[match.end():]
                    # Anchor not allowed here.
                    curr_pos = start_idx + 1
                # This id isn't defined, leave the markup alone.
                curr_pos = match.end()

            # Otherwise, it isn't markup.
            curr_pos = start_idx + 1
def header_id_from_text(self, text, prefix, n):
    """Generate a header id attribute value from the given header text.

    This is only called if the "header-ids" extra is enabled.
    Subclasses may override this for different header ids.

    @param text {str} The text of the header tag
    @param prefix {str} The requested prefix for header ids. This is the
        value of the "header-ids" extra key, if any. Otherwise, None.
    @param n {int} The <hN> tag number, i.e. `1` for an <h1> tag.
    @returns {str} The value for the header tag's "id" attribute. Return
        None to not have an id attribute and to exclude this header from
        the TOC (if the "toc" extra is specified).
    """
    header_id = _slugify(text)
    # Py3 fix: `basestring` does not exist in Python 3 (this file already
    # uses f-strings elsewhere, so it targets Python 3).
    if prefix and isinstance(prefix, str):
        header_id = prefix + '-' + header_id
    if header_id in self._count_from_header_id:
        # Duplicate header text: de-duplicate with a numeric suffix.
        self._count_from_header_id[header_id] += 1
        header_id += '-%s' % self._count_from_header_id[header_id]
    else:
        self._count_from_header_id[header_id] = 1
    return header_id
1174 def _toc_add_entry(self, level, id, name):
1175 if self._toc is None:
1177 self._toc.append((level, id, name))
# Setext-style ("underlined") header handling. NOTE(review): this appears to
# be dead code in this fork -- _do_headers below has the corresponding
# substitution commented out ("zarvox: nuke this nonsense").
_setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M)
def _setext_h_sub(self, match):
    # '=' underline -> h1, '-' underline -> h2.
    n = {"=": 1, "-": 2}[match.group(2)[0]]
    demote_headers = self.extras.get("demote-headers")
    # NOTE(review): presumably guarded by `if demote_headers:` originally --
    # as written this raises TypeError when the extra is absent; confirm.
    n = min(n + demote_headers, 6)
    if "header-ids" in self.extras:
        header_id = self.header_id_from_text(match.group(1),
            self.extras["header-ids"], n)
        # NOTE(review): no fallback assignment of `header_id_attr` is visible
        # for the case where "header-ids" is off -- likely missing lines.
        header_id_attr = ' id="%s"' % header_id
    html = self._run_span_gamut(match.group(1))
    if "toc" in self.extras and header_id:
        self._toc_add_entry(n, header_id, html)
    return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
1196 _atx_h_re = re.compile(r'''
1197 ^(\={1,6}) # \1 = string of ='s
1199 (.+?) # \2 = Header text
1201 (?<!\\) # ensure not an escaped trailing '#'
1202 \=* # optional closing #'s (not counted)
1205 def _atx_h_sub(self, match):
1206 n = len(match.group(1))
1207 demote_headers = self.extras.get("demote-headers")
1209 n = min(n + demote_headers, 6)
1211 if "header-ids" in self.extras:
1212 header_id = self.header_id_from_text(match.group(2),
1213 self.extras["header-ids"], n)
1215 header_id_attr = ' id="%s"' % header_id
1216 html = self._run_span_gamut(match.group(2))
1217 if "toc" in self.extras and header_id:
1218 self._toc_add_entry(n, header_id, html)
1219 return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
1221 def _do_headers(self, text):
1222 # Setext-style headers:
1228 # zarvox: nuke this nonsense
1229 # text = self._setext_h_re.sub(self._setext_h_sub, text)
1231 # atx-style headers:
1234 # ## Header 2 with closing hashes ##
1237 text = self._atx_h_re.sub(self._atx_h_sub, text)
# List-marker characters, dokuwiki-flavored: '*' marks unordered lists,
# '-' and '#' mark ordered lists.
_marker_ul_chars = '*'
_marker_ol_chars = '-#'
# Re-usable (non-capturing) marker patterns built from the sets above.
_marker_any = r'(?:[%s]|[%s])' % (_marker_ul_chars, _marker_ol_chars)
_marker_ul = '(?:[%s])' % _marker_ul_chars
_marker_ol = r'(?:[%s])' % _marker_ol_chars
1248 def _list_sub(self, match):
1249 lst = match.group(1)
1250 lst_type = match.group(3) in self._marker_ul_chars and "ul" or "ol"
1251 result = self._process_list_items(lst)
1253 return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type)
1255 return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type)
def _do_lists(self, text):
    # Form HTML ordered (numbered) and unordered (bulleted) lists.
    # NOTE(review): the opening of the `whole_list` verbose-regex literal and
    # most of its alternatives appear to be missing from this copy, as does
    # the closing of the `list_re` re.compile() call below. Compare with
    # upstream python-markdown2 before editing.
    for marker_pat in (self._marker_ul, self._marker_ol):
        # Re-usable pattern to match any entire ul or ol list:
        less_than_tab = self.tab_width - 1
            (%s)            # \3 = first list item marker
            (?!\ *\3\ )     # '- - - ...' isn't a list. See 'not_quite_a_list' test case.
            (?!             # Negative lookahead for another list item marker
        ''' % (less_than_tab, marker_pat, marker_pat)
        # We use a different prefix before nested lists than top-level lists.
        # See extended comment in _process_list_items().
        #
        # Note: There's a bit of duplication here. My original implementation
        # created a scalar regex pattern as the conditional result of the test on
        # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
        # substitution once, using the scalar as the pattern. This worked,
        # everywhere except when running under MT on my hosting account at Pair
        # Networks. There, this caused all rebuilds to be killed by the reaper (or
        # perhaps they crashed, but that seems incredibly unlikely given that the
        # same script on the same server ran fine *except* under MT. I've spent
        # more time trying to figure out why this is happening than I'd like to
        # admit. My only guess, backed up by the fact that this workaround works,
        # is that Perl optimizes the substition when it can figure out that the
        # pattern will never change, and when this optimization isn't on, we run
        # afoul of the reaper. Thus, the slightly redundant code to that uses two
        # static s/// patterns rather than one conditional pattern.
        sub_list_re = re.compile("^"+whole_list, re.X | re.M | re.S)
        text = sub_list_re.sub(self._list_sub, text)
        list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,
        text = list_re.sub(self._list_sub, text)
        # NOTE(review): a trailing `return text` also appears to be missing.
# Pattern for one list item inside a matched list block.
# NOTE(review): the closing of this re.compile() call (the flag arguments
# and closing paren) appears to be missing from this copy.
_list_item_re = re.compile(r'''
    (\n)?                   # leading line = \1
    (^[ \t]*)               # leading whitespace = \2
    (?P<marker>%s) [ \t]+   # list marker = \3
    ((?:.+?)                # list item text = \4
    (\n{1,2}))              # eols = \5
    (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+))
    ''' % (_marker_any, _marker_any),

# Tracks whether the previous <li> ended with a blank line ("loose" item),
# which forces block-level processing of the next item.
_last_li_endswith_two_eols = False
1324 def _list_item_sub(self, match):
1325 item = match.group(4)
1326 leading_line = match.group(1)
1327 leading_space = match.group(2)
1328 if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
1329 item = self._run_block_gamut(self._outdent(item))
1331 # Recursion for sub-lists:
1332 item = self._do_lists(self._outdent(item))
1333 if item.endswith('\n'):
1335 item = self._run_span_gamut(item)
1336 self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
1337 return "<li>%s</li>\n" % item
1339 def _process_list_items(self, list_str):
1340 # Process the contents of a single ordered or unordered list,
1341 # splitting it into individual list items.
1343 # The $g_list_level global keeps track of when we're inside a list.
1344 # Each time we enter a list, we increment it; when we leave a list,
1345 # we decrement. If it's zero, we're not in a list anymore.
1347 # We do this because when we're not inside a list, we want to treat
1348 # something like this:
1350 # I recommend upgrading to version
1351 # 8. Oops, now this line is treated
1354 # As a single paragraph, despite the fact that the second line starts
1355 # with a digit-period-space sequence.
1357 # Whereas when we're inside a list (or sub-list), that line will be
1358 # treated as the start of a sub-list. What a kludge, huh? This is
1359 # an aspect of Markdown's syntax that's hard to parse perfectly
1360 # without resorting to mind-reading. Perhaps the solution is to
1361 # change the syntax rules such that sub-lists must start with a
1362 # starting cardinal number; e.g. "1." or "a.".
1363 self.list_level += 1
1364 self._last_li_endswith_two_eols = False
1365 list_str = list_str.rstrip('\n') + '\n'
1366 list_str = self._list_item_re.sub(self._list_item_sub, list_str)
1367 self.list_level -= 1
1370 def _get_pygments_lexer(self, lexer_name):
1372 from pygments import lexers, util
1376 return lexers.get_lexer_by_name(lexer_name)
1377 except util.ClassNotFound:
def _color_with_pygments(self, codeblock, lexer, **formatter_opts):
    # Highlight `codeblock` to HTML with the given Pygments lexer.
    # Importing the submodule also binds the `pygments` package name used
    # in the highlight() call below.
    import pygments.formatters

    class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):
        def _wrap_code(self, inner):
            """A function for use in a Pygments Formatter which
            wraps in <code> tags.
            """
            # NOTE(review): the yield statements of this generator appear
            # to be missing from this copy -- compare with upstream
            # python-markdown2 before editing.

        def wrap(self, source, outfile):
            """Return the source with a code, pre, and div."""
            return self._wrap_div(self._wrap_pre(self._wrap_code(source)))

    formatter = HtmlCodeFormatter(cssclass="codehilite", **formatter_opts)
    return pygments.highlight(codeblock, lexer, formatter)
1401 def _code_block_sub(self, match):
1402 codeblock = match.group(1)
1403 codeblock = self._outdent(codeblock)
1404 codeblock = self._detab(codeblock)
1405 codeblock = codeblock.lstrip('\n') # trim leading newlines
1406 codeblock = codeblock.rstrip() # trim trailing whitespace
1408 if "code-color" in self.extras and codeblock.startswith(":::"):
1409 lexer_name, rest = codeblock.split('\n', 1)
1410 lexer_name = lexer_name[3:].strip()
1411 lexer = self._get_pygments_lexer(lexer_name)
1412 codeblock = rest.lstrip("\n") # Remove lexer declaration line.
1414 formatter_opts = self.extras['code-color'] or {}
1415 colored = self._color_with_pygments(codeblock, lexer,
1417 return "\n\n%s\n\n" % colored
1419 codeblock = self._encode_code(codeblock)
1420 pre_class_str = self._html_class_str_from_tag("pre")
1421 code_class_str = self._html_class_str_from_tag("code")
1422 return "\n\n<pre%s><code%s>%s\n</code></pre>\n\n" % (
1423 pre_class_str, code_class_str, codeblock)
1425 def _html_class_str_from_tag(self, tag):
1426 """Get the appropriate ' class="..."' string (note the leading
1427 space), if any, for the given tag.
1429 if "html-classes" not in self.extras:
1432 html_classes_from_tag = self.extras["html-classes"]
1436 if tag in html_classes_from_tag:
1437 return ' class="%s"' % html_classes_from_tag[tag]
1440 def _do_code_blocks(self, text):
1441 """Process Markdown `<pre><code>` blocks."""
1442 code_block_re = re.compile(r'''
1444 ( # $1 = the code block -- one or more lines, starting with a space/tab
1446 (?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces
1450 ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
1451 ''' % (self.tab_width, self.tab_width),
1454 return code_block_re.sub(self._code_block_sub, text)
1457 # Rules for a code span:
1458 # - backslash escapes are not interpreted in a code span
1459 # - to include one or or a run of more backticks the delimiters must
1460 # be a longer run of backticks
1461 # - cannot start or end a code span with a backtick; pad with a
1462 # space and that space will be removed in the emitted HTML
1463 # See `test/tm-cases/escapes.text` for a number of edge-case
1465 _code_span_re = re.compile(r'''
1467 (`+) # \1 = Opening run of `
1468 (?!`) # See Note A test/tm-cases/escapes.text
1469 (.+?) # \2 = The code block
1471 \1 # Matching closer
1475 def _code_span_sub(self, match):
1476 c = match.group(2).strip(" \t")
1477 c = self._encode_code(c)
1478 return "<code>%s</code>" % c
1480 def _do_code_spans(self, text):
1481 # * Backtick quotes are used for <code></code> spans.
1483 # * You can use multiple backticks as the delimiters if you want to
1484 # include literal backticks in the code span. So, this input:
1486 # Just type ``foo `bar` baz`` at the prompt.
1488 # Will translate to:
1490 # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
1492 # There's no arbitrary limit to the number of backticks you
1493 # can use as delimters. If you need three consecutive backticks
1494 # in your code, use four for delimiters, etc.
1496 # * You can use spaces to get literal backticks at the edges:
1498 # ... type `` `bar` `` ...
1502 # ... type <code>`bar`</code> ...
1503 return self._code_span_re.sub(self._code_span_sub, text)
1505 def _encode_code(self, text):
1506 """Encode/escape certain characters inside Markdown code runs.
1507 The point is that in code, these characters are literals,
1508 and lose their special Markdown meanings.
1511 # Encode all ampersands; HTML entities are not
1512 # entities within a Markdown code span.
1514 # Do the angle bracket song and dance:
1517 # Now, escape characters that are magic in Markdown:
1518 ('*', self._escape_table['*']),
1519 ('_', self._escape_table['_']),
1520 ('{', self._escape_table['{']),
1521 ('}', self._escape_table['}']),
1522 ('[', self._escape_table['[']),
1523 (']', self._escape_table[']']),
1524 ('\\', self._escape_table['\\']),
1526 for before, after in replacements:
1527 text = text.replace(before, after)
1530 _strong_re = re.compile(r"(\*\*)(?=\S)(.+?[*]*)(?<=\S)\*\*", re.S)
1531 _em_re = re.compile(r"(?<!:)(\/\/)(?=\S)(.+?)(?<![\t\n\r\f\v:])\/\/", re.S)
1532 _underline_re = re.compile(r"(__)(?=\S)(.+?)(?<=\S)__", re.S)
1533 _monospace_re = re.compile(r"(\'\')(?=\S)(.+?)(?<=\S)\'\'", re.S)
1534 def _do_italics_bold_underline_mono(self, text):
1535 text = self._strong_re.sub(r"<strong>\2</strong>", text)
1536 text = self._em_re.sub(r"<em>\2</em>", text)
1537 text = self._underline_re.sub(r"<span style='text-decoration:underline;'>\2</span>", text)
1538 text = self._monospace_re.sub(r"<span style='font-family:monospace;'>\2</span>", text)
1541 # "smarty-pants" extra: Very liberal in interpreting a single prime as an
1542 # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and
1543 # "twixt" can be written without an initial apostrophe. This is fine because
1544 # using scare quotes (single quotation marks) is rare.
1545 _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))")
1546 _contractions = ["tis", "twas", "twer", "neath", "o", "n",
1547 "round", "bout", "twixt", "nuff", "fraid", "sup"]
1548 def _do_smart_contractions(self, text):
1549 text = self._apostrophe_year_re.sub(r"’\1", text)
1550 for c in self._contractions:
1551 text = text.replace("'%s" % c, "’%s" % c)
1552 text = text.replace("'%s" % c.capitalize(),
1553 "’%s" % c.capitalize())
1556 # Substitute double-quotes before single-quotes.
1557 _opening_single_quote_re = re.compile(r"(?<!\S)'(?=\S)")
1558 _opening_double_quote_re = re.compile(r'(?<!\S)"(?=\S)')
1559 _closing_single_quote_re = re.compile(r"(?<=\S)'")
1560 _closing_double_quote_re = re.compile(r'(?<=\S)"(?=(\s|,|;|\.|\?|!|$))')
1561 def _do_smart_punctuation(self, text):
1562 """Fancifies 'single quotes', "double quotes", and apostrophes.
1563 Converts --, ---, and ... into en dashes, em dashes, and ellipses.
1565 Inspiration is: <http://daringfireball.net/projects/smartypants/>
1566 See "test/tm-cases/smarty_pants.text" for a full discussion of the
1568 <http://code.google.com/p/python-markdown2/issues/detail?id=42> for a
1569 discussion of some diversion from the original SmartyPants.
1571 if "'" in text: # guard for perf
1572 text = self._do_smart_contractions(text)
1573 text = self._opening_single_quote_re.sub("‘", text)
1574 text = self._closing_single_quote_re.sub("’", text)
1576 if '"' in text: # guard for perf
1577 text = self._opening_double_quote_re.sub("“", text)
1578 text = self._closing_double_quote_re.sub("”", text)
1580 text = text.replace("---", "—")
1581 text = text.replace("--", "–")
1582 text = text.replace("...", "…")
1583 text = text.replace(" . . . ", "…")
1584 text = text.replace(". . .", "…")
1587 _block_quote_re = re.compile(r'''
1588 ( # Wrap whole match in \1
1590 ^[ \t]*>[ \t]? # '>' at the start of a line
1591 .+\n # rest of the first line
1592 (.+\n)* # subsequent consecutive lines
1597 _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M);
1599 _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
1600 def _dedent_two_spaces_sub(self, match):
1601 return re.sub(r'(?m)^ ', '', match.group(1))
1603 def _block_quote_sub(self, match):
1605 bq = self._bq_one_level_re.sub('', bq) # trim one level of quoting
1606 bq = self._ws_only_line_re.sub('', bq) # trim whitespace-only lines
1607 bq = self._run_block_gamut(bq) # recurse
1609 bq = re.sub('(?m)^', ' ', bq)
1610 # These leading spaces screw with <pre> content, so we need to fix that:
1611 bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)
1613 return "<blockquote>\n%s\n</blockquote>\n\n" % bq
1615 def _do_block_quotes(self, text):
1618 return self._block_quote_re.sub(self._block_quote_sub, text)
def _form_paragraphs(self, text):
    # Wrap the remaining text chunks in <p> tags, un-hashing stored HTML
    # blocks and handling "cuddled" lists.
    # NOTE(review): several lines appear to be missing from this copy
    # (the `grafs = []` / `cuddled_list` initializations, the else-arm of
    # the html_blocks test, and the `start` assignment) -- compare with
    # upstream python-markdown2 before editing.
    # Strip leading and trailing lines:
    text = text.strip('\n')

    for i, graf in enumerate(re.split(r"\n{2,}", text)):
        if graf in self.html_blocks:
            # Unhashify HTML blocks
            grafs.append(self.html_blocks[graf])
        if "cuddled-lists" in self.extras:
            # Need to put back trailing '\n' for `_list_item_re`
            # match at the end of the paragraph.
            li = self._list_item_re.search(graf + '\n')
            # Two of the same list marker in this paragraph: a likely
            # candidate for a list cuddled to preceding paragraph
            # text (issue 33). Note the `[-1]` is a quick way to
            # consider numeric bullets (e.g. "1." and "2.") to be
            # equal.
            if (li and len(li.group(2)) <= 3 and li.group("next_marker")
                and li.group("marker")[-1] == li.group("next_marker")[-1]):
                cuddled_list = self._do_lists(graf[start:]).rstrip("\n")
                assert cuddled_list.startswith("<ul>") or cuddled_list.startswith("<ol>")
        graf = self._run_span_gamut(graf)
        grafs.append("<p>" + graf.lstrip(" \t") + "</p>")
        # NOTE(review): presumably guarded by `if cuddled_list:` originally.
        grafs.append(cuddled_list)

    return "\n\n".join(grafs)
def _add_footnotes(self, text):
    # Append the rendered footnotes as a trailing <div class="footnotes">
    # block with backlinks to each reference.
    # NOTE(review): lines appear to be missing from this copy (the
    # `if self.footnotes:` guard, the opening `footer = [` of the list
    # below, and the `<ol>` entry) -- compare with upstream.
        '<div class="footnotes">',
        '<hr' + self.empty_element_suffix,
    for i, id in enumerate(self.footnote_ids):
        footer.append('<li id="fn-%s">' % id)
        footer.append(self._run_block_gamut(self.footnotes[id]))
        backlink = ('<a href="#fnref-%s" '
                    'class="footnoteBackLink" '
                    'title="Jump back to footnote %d in the text.">'
                    '↩</a>' % (id, i+1))
        if footer[-1].endswith("</p>"):
            # Tuck the backlink inside the final paragraph.
            footer[-1] = footer[-1][:-len("</p>")] \
                + ' ' + backlink + "</p>"
        # NOTE(review): likely under a missing `else:` arm.
        footer.append("\n<p>%s</p>" % backlink)
        footer.append('</li>')
    footer.append('</ol>')
    footer.append('</div>')
    return text + '\n\n' + '\n'.join(footer)
1685 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1686 # http://bumppo.net/projects/amputator/
1687 _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
1688 _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
1689 _naked_gt_re = re.compile(r'''(?<![a-z?!/'"-])>''', re.I)
1691 def _encode_amps_and_angles(self, text):
1692 # Smart processing for ampersands and angle brackets that need
1694 text = self._ampersand_re.sub('&', text)
1697 text = self._naked_lt_re.sub('<', text)
1700 # Note: Other markdown implementations (e.g. Markdown.pl, PHP
1701 # Markdown) don't do this.
1702 text = self._naked_gt_re.sub('>', text)
1705 def _encode_backslash_escapes(self, text):
1706 for ch, escape in self._escape_table.items():
1707 text = text.replace("\\"+ch, escape)
1710 _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
1711 def _auto_link_sub(self, match):
1713 return '<a href="%s">%s</a>' % (g1, g1)
# Auto-linkify bare email addresses. NOTE(review): the middle of this
# verbose-regex literal (the optional "mailto:" prefix and the local part
# of the address) appears to be missing from this copy.
_auto_email_link_re = re.compile(r"""
      [-\w]+(\.[-\w]+)*\.[a-z]+
    """, re.I | re.X | re.U)

def _auto_email_link_sub(self, match):
    # Render the matched address as a spam-resistant encoded mailto link.
    return self._encode_email_address(
        self._unescape_special_chars(match.group(1)))
1729 def _do_auto_links(self, text):
1730 text = self._auto_link_re.sub(self._auto_link_sub, text)
1731 text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
def _encode_email_address(self, addr):
    # Input: an email address, e.g. "foo@example.com"
    #
    # Output: the email address as a mailto link, with each character
    # of the address encoded as either a decimal or hex entity, in
    # the hopes of foiling most address harvesting spam bots.
    #
    # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
    # mailing list: <http://tinyurl.com/yu7ue>
    chars = [_xml_encode_email_char_at_random(ch)
             for ch in "mailto:" + addr]
    # Strip the mailto: from the visible part (the first 7 encoded chars).
    addr = '<a href="%s">%s</a>' \
        % (''.join(chars), ''.join(chars[7:]))
    return addr
def _do_link_patterns(self, text):
    """Caveat emptor: there isn't much guarding against link
    patterns being formed inside other standard Markdown links, e.g.
    inside a [link def][like this].

    Dev Notes: *Could* consider prefixing regexes with a negative
    lookbehind assertion to attempt to guard against this.
    """
    link_from_hash = {}
    for regex, repl in self.link_patterns:
        replacements = []
        for match in regex.finditer(text):
            if hasattr(repl, "__call__"):
                href = repl(match)
            else:
                href = match.expand(repl)
            replacements.append((match.span(), href))
        # Apply from the back so earlier recorded spans stay valid.
        for (start, end), href in reversed(replacements):
            escaped_href = (
                href.replace('"', '&quot;')  # b/c of attr quote
                    # To avoid markdown <em> and <strong>:
                    .replace('*', self._escape_table['*'])
                    .replace('_', self._escape_table['_']))
            link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
            # Replace the matched span with an opaque hash so later span
            # processing cannot touch the generated markup.
            hash = _hash_text(link)
            link_from_hash[hash] = link
            text = text[:start] + hash + text[end:]
    for hash, link in link_from_hash.items():
        text = text.replace(hash, link)
    return text
1785 def _unescape_special_chars(self, text):
1786 # Swap back in all the special characters we've hidden.
1787 for ch, hash in self._escape_table.items():
1788 text = text.replace(hash, ch)
1791 def _outdent(self, text):
1792 # Remove one level of line-leading tabs or spaces
1793 return self._outdent_re.sub('', text)
class MarkdownWithExtras(Markdown):
    """A markdowner class that enables most extras:

    - footnotes
    - code-color (only has effect if 'pygments' Python module on path)

    These are not included:
    - pyshell (specific to Python-related documenting)
    - code-friendly (because it *disables* part of the syntax)
    - link-patterns (because you need to specify some actual
      link-patterns anyway)
    """
    extras = ["footnotes", "code-color"]
#---- internal support functions

class UnicodeWithAttrs(str):
    """A subclass of unicode used for the return value of conversion to
    possibly attach some attributes. E.g. the "toc_html" attribute when
    the "toc" extra is used.
    """
    # NOTE(review): the `@property` / `def toc_html(self):` header and the
    # setup lines (`return None`, `lines = []`, the nested `indent()`
    # helper) appear to be missing from this copy, leaving the body below
    # dangling at class level. Compare with upstream before editing.
    """Return the HTML for the current TOC.

    This expects the `_toc` attribute to have been set on this instance.
    """
    if self._toc is None:
        return ' ' * (len(h_stack) - 1)
    h_stack = [0]  # stack of header-level numbers
    for level, id, name in self._toc:
        if level > h_stack[-1]:
            # Deeper header level: open a nested list.
            lines.append("%s<ul>" % indent())
            h_stack.append(level)
        elif level == h_stack[-1]:
            # Sibling header: close the previous list item.
            lines[-1] += "</li>"
        # NOTE(review): an `else:` and the `h_stack.pop()` calls in both
        # while-loops below appear to be missing.
        while level < h_stack[-1]:
            if not lines[-1].endswith("</li>"):
                lines[-1] += "</li>"
            lines.append("%s</ul></li>" % indent())
        lines.append(u'%s<li><a href="#%s">%s</a>' % (
            indent(), id, name))
    while len(h_stack) > 1:
        if not lines[-1].endswith("</li>"):
            lines[-1] += "</li>"
        lines.append("%s</ul>" % indent())
    return '\n'.join(lines) + '\n'
1854 ## {{{ http://code.activestate.com/recipes/577257/ (r1)
1855 _slugify_strip_re = re.compile(r'[^\w\s-]')
1856 _slugify_hyphenate_re = re.compile(r'[-\s]+')
1857 def _slugify(value):
1859 Normalizes string, converts to lowercase, removes non-alpha characters,
1860 and converts spaces to hyphens.
1862 From Django's "django/template/defaultfilters.py".
1865 value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
1866 value = unicode(_slugify_strip_re.sub('', value).strip().lower())
1867 return _slugify_hyphenate_re.sub('-', value)
1868 ## end of http://code.activestate.com/recipes/577257/ }}}
1871 # From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
1872 def _curry(*args, **kwargs):
1873 function, args = args[0], args[1:]
1874 def result(*rest, **kwrest):
1875 combined = kwargs.copy()
1876 combined.update(kwrest)
1877 return function(*args + rest, **combined)
1880 # Recipe: regex_from_encoded_pattern (1.0)
1881 def _regex_from_encoded_pattern(s):
1882 """'foo' -> re.compile(re.escape('foo'))
1883 '/foo/' -> re.compile('foo')
1884 '/foo/i' -> re.compile('foo', re.I)
1886 if s.startswith('/') and s.rfind('/') != 0:
1887 # Parse it: /PATTERN/FLAGS
1889 pattern, flags_str = s[1:idx], s[idx+1:]
1898 for char in flags_str:
1900 flags |= flag_from_char[char]
1902 raise ValueError("unsupported regex flag: '%s' in '%s' "
1903 "(must be one of '%s')"
1904 % (char, s, ''.join(flag_from_char.keys())))
1905 return re.compile(s[1:idx], flags)
1906 else: # not an encoded regex
1907 return re.compile(re.escape(s))
# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
    """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines

    "lines" is a list of lines to dedent.
    "tabsize" is the tab width to use for indent width calculations.
    "skip_first_line" is a boolean indicating if the first line should
    be skipped for calculating the indent width and for dedenting.
    This is sometimes useful for docstrings and similar.

    Same as dedent() except operates on a sequence of lines. Note: the
    lines list is modified **in-place**.
    """
    # NOTE(review): many lines appear missing from this copy (the DEBUG
    # guard, `margin`/`indent` initializations, the per-char indent scan,
    # and the trailing `return lines`); compare with upstream.
    print(f"dedent: dedent(..., tabsize={tabsize}, skip_first_line={skip_first_line})")
    for i, line in enumerate(lines):
        if i == 0 and skip_first_line: continue
        indent += tabsize - (indent % tabsize)
        continue  # skip all-whitespace lines
        continue  # skip all-whitespace lines
        if DEBUG: print(f"dedent: indent={indent}: {line}")
        margin = min(margin, indent)
    if DEBUG: print(f"dedent: margin={margin}")
    if margin is not None and margin > 0:
        for i, line in enumerate(lines):
            if i == 0 and skip_first_line: continue
            for j, ch in enumerate(line):
                removed += tabsize - (removed % tabsize)
                # NOTE(review): the next print is missing its f-prefix, so
                # "{line}" is emitted literally -- likely a bug.
                if DEBUG: print("dedent: {line}: EOL -> strip up to EOL")
                lines[i] = lines[i][j:]
                raise ValueError("unexpected non-whitespace char %r in "
                                 "line %r while removing %d-space margin"
                                 % (ch, line, margin))
                print(f"dedent: {line}: {ch} -> removed {removed}/{margin}")
                if removed == margin:
                    lines[i] = lines[i][j+1:]
                elif removed > margin:
                    lines[i] = ' '*(removed-margin) + lines[i][j+1:]
            lines[i] = lines[i][removed:]
def _dedent(text, tabsize=8, skip_first_line=False):
    """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text

    "text" is the text to dedent.
    "tabsize" is the tab width to use for indent width calculations.
    "skip_first_line" is a boolean indicating if the first line should
    be skipped for calculating the indent width and for dedenting.
    This is sometimes useful for docstrings and similar.

    Like textwrap.dedent(s), but don't expand tabs to spaces.
    """
    lines = text.splitlines(1)
    _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
    return ''.join(lines)
1994 class _memoized(object):
1995 """Decorator that caches a function's return value each time it is called.
1996 If called later with the same arguments, the cached value is returned, and
1999 http://wiki.python.org/moin/PythonDecoratorLibrary
2001 def __init__(self, func):
2004 def __call__(self, *args):
2006 return self.cache[args]
2008 self.cache[args] = value = self.func(*args)
2011 # uncachable -- for instance, passing a list as an argument.
2012 # Better to not cache than to blow up entirely.
2013 return self.func(*args)
2015 """Return the function's docstring."""
2016 return self.func.__doc__
def _xml_oneliner_re_from_tab_width(tab_width):
    """Standalone XML processing instruction regex."""
    # NOTE(review): several lines of this verbose-regex literal (grouping,
    # alternation bars, the indent prefix using the %d) appear to be
    # missing from this copy -- compare with upstream before editing.
    return re.compile(r"""
        (?<=\n\n)           # Starting after a blank line
        \A\n?               # the beginning of the doc
        <\?\w+\b\s+.*?\?>   # XML processing instruction
        <\w+:\w+\b\s+.*?/>  # namespaced single tag
        (?=\n{2,}|\Z)       # followed by a blank line or end of document
        """ % (tab_width - 1), re.X)
# Memoized per tab_width (patterns are invariant for a given width).
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
def _hr_tag_re_from_tab_width(tab_width):
    # Regex for a standalone <hr> tag on its own line.
    # NOTE(review): several lines of this verbose-regex literal appear to
    # be missing from this copy -- compare with upstream before editing.
    return re.compile(r"""
        (?<=\n\n)       # Starting after a blank line
        \A\n?           # the beginning of the doc
        <(hr)           # start tag = \2
        /?>             # the matching end tag
        (?=\n{2,}|\Z)   # followed by a blank line or end of document
        """ % (tab_width - 1), re.X)
# Memoized per tab_width (patterns are invariant for a given width).
_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)
2060 def _xml_escape_attr(attr, skip_single_quote=True):
2061 """Escape the given string for use in an HTML/XML tag attribute.
2063 By default this doesn't bother with escaping `'` to `'`, presuming that
2064 the tag attribute is surrounded by double quotes.
2067 .replace('&', '&')
2068 .replace('"', '"')
2069 .replace('<', '<')
2070 .replace('>', '>'))
2071 if not skip_single_quote:
2072 escaped = escaped.replace("'", "'")
2076 def _xml_encode_email_char_at_random(ch):
2078 # Roughly 10% raw, 45% hex, 45% dec.
2079 # '@' *must* be encoded. I [John Gruber] insist.
2080 # Issue 26: '_' must be encoded.
2081 if r > 0.9 and ch not in "@_":
2084 # The [1:] is to drop leading '0': 0x63 -> x63
2085 return '&#%s;' % hex(ord(ch))[1:]
2087 return '&#%s;' % ord(ch)
2093 class _NoReflowFormatter(optparse.IndentedHelpFormatter):
2094 """An optparse formatter that does NOT reflow the description."""
2095 def format_description(self, description):
2096 return description or ""
def main(argv=None):
    """Command-line entry point: convert markdown PATHS to HTML.

    NOTE(review): numerous lines appear to be missing from this copy (the
    `extras`/`link_patterns` initializations, try/except arms, the
    per-path loop header, the `if opts.compare:` guard, ...); compare
    with upstream python-markdown2 before editing.
    """
    if not logging.root.handlers:
        logging.basicConfig()

    usage = "usage: %prog [PATHS...]"
    version = "%prog "+__version__
    parser = optparse.OptionParser(prog="markdown2", usage=usage,
        version=version, description=cmdln_desc,
        formatter=_NoReflowFormatter())
    parser.add_option("-v", "--verbose", dest="log_level",
        action="store_const", const=logging.DEBUG,
        help="more verbose output")
    parser.add_option("--encoding",
        help="specify encoding of text content")
    parser.add_option("--html4tags", action="store_true", default=False,
        help="use HTML 4 style for empty element tags")
    parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",
        help="sanitize literal HTML: 'escape' escapes "
             "HTML meta chars, 'replace' replaces with an "
             "[HTML_REMOVED] note")
    parser.add_option("-x", "--extras", action="append",
        help="Turn on specific extra features (not part of "
             "the core Markdown spec). See above.")
    parser.add_option("--use-file-vars",
        help="Look for and use Emacs-style 'markdown-extras' "
             "file var to turn on extras. See "
             "<https://github.com/trentm/python-markdown2/wiki/Extras>")
    parser.add_option("--link-patterns-file",
        help="path to a link pattern file")
    parser.add_option("--self-test", action="store_true",
        help="run internal self-tests (some doctests)")
    parser.add_option("--compare", action="store_true",
        help="run against Markdown.pl as well (for testing)")
    parser.set_defaults(log_level=logging.INFO, compare=False,
        encoding="utf-8", safe_mode=None, use_file_vars=False)
    opts, paths = parser.parse_args()
    log.setLevel(opts.log_level)

    # Parse "-x" extras: comma/semicolon/space-separated NAME[=ARG] items.
    # NOTE(review): the `extras = {}` initialization and the `if '=' in e:`
    # branch around the two unpackings below appear to be missing.
    for s in opts.extras:
        splitter = re.compile("[,;: ]+")
        for e in splitter.split(s):
            ename, earg = e.split('=', 1)
            ename, earg = e, None
            extras[ename] = earg

    # Load link patterns, one "PATTERN HREF" pair per non-comment line.
    # NOTE(review): the `link_patterns = []` initialization, try/except
    # arms, and `finally: f.close()` appear to be missing here.
    if opts.link_patterns_file:
        f = open(opts.link_patterns_file)
        for i, line in enumerate(f.readlines()):
            if not line.strip(): continue
            if line.lstrip().startswith("#"): continue
            pat, href = line.rstrip().rsplit(None, 1)
            raise MarkdownError("%s:%d: invalid link pattern line: %r"
                % (opts.link_patterns_file, i+1, line))
            link_patterns.append(
                (_regex_from_encoded_pattern(pat), href))
        link_patterns = None
    from os.path import join, dirname, abspath, exists
    # NOTE(review): the final "Markdown.pl" join argument, the
    # `for path in paths:` loop header, and the `if opts.compare:` guard
    # appear to be missing around the block below.
    markdown_pl = join(dirname(dirname(abspath(__file__))), "test",
        print("==== Markdown.pl ====")
        perl_cmd = 'perl %s "%s"' % (markdown_pl, path)
        o = os.popen(perl_cmd)
        perl_html = o.read()
        sys.stdout.write(perl_html)
        print("==== markdown2.py ====")
    html = markdown_path(path, encoding=opts.encoding,
        html4tags=opts.html4tags,
        safe_mode=opts.safe_mode,
        extras=extras, link_patterns=link_patterns,
        use_file_vars=opts.use_file_vars)
        # NOTE(review): the opening `sys.stdout.write(` of this call
        # appears to be missing.
        html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
    if extras and "toc" in extras:
        log.debug("toc_html: " +
            html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
    test_dir = join(dirname(dirname(abspath(__file__))), "test")
    if exists(join(test_dir, "test_markdown2.py")):
        sys.path.insert(0, test_dir)
        from test_markdown2 import norm_html_from_html
        norm_html = norm_html_from_html(html)
        norm_perl_html = norm_html_from_html(perl_html)
        # NOTE(review): likely the else-arm of the `if exists(...)` above.
        norm_perl_html = perl_html
    did_match = (norm_perl_html == norm_html)
    print(f"==== match? {did_match} ====")
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit( main(sys.argv) )