module MaRuKu::In::Markdown::BlockLevelParser
Public Instance Methods
Counts the actual number of columns in a row, taking colspans into account.
# File lib/maruku/input/parse_block.rb, line 653
# Returns the number of table columns occupied by +row+.
#
# A cell whose attribute list (+cell.al+) carries a "colspan" entry counts
# for that many columns (the entry's value converted with +to_i+); every
# other cell counts for exactly one.
def count_columns(row)
  row.inject(0) do |total, cell|
    attrs = cell.al
    span = find_colspan(attrs) if attrs && attrs.size > 0
    total + (span.nil? ? 1 : span[1].to_i)
  end
end
Is the given element an HTML element whose root is not an inline element?
# File lib/maruku/input/parse_block.rb, line 324
# True when +elem+ is a :raw_html MDElement with parsed HTML whose first
# node name is not listed in HTML_INLINE_ELEMS.
#
# Returns false for anything that is not such an element, and a falsy
# value (the missing node name itself) when the parsed HTML has no first
# node name.
def element_is_non_inline_html?(elem)
  return false unless elem.is_a?(MDElement) && elem.node_type == :raw_html && elem.parsed_html

  root_name = elem.parsed_html.first_node_name
  root_name && !HTML_INLINE_ELEMS.include?(root_name)
end
If the current line is text, a definition list is coming when the upcoming lines are: one or more text lines, an optional empty line, then a definition.
# File lib/maruku/input/parse_block.rb, line 681
# Tests the line-type summary returned by +src.tell_me_the_future+ for the
# shape "one or more t, an optional e, then d".
#
# Returns the match index (truthy) when the pattern matches, nil otherwise.
def eventually_comes_a_def_list(src)
  upcoming = src.tell_me_the_future
  upcoming =~ %r{^t+e?d}x
end
# File lib/maruku/input/parse_doc.rb, line 123
# Runs the code of every :xml_instr element whose target is 'maruku'
# through safe_execute_code, evaluated against the element itself.
# A String result is echoed to standard output.
def execute_code_blocks
  each_element(:xml_instr) do |e|
    next unless e.target == 'maruku'

    result = safe_execute_code(e, e.code)
    puts "Result is : #{result.inspect}" if result.kind_of?(String)
  end
end
Expands an attribute list into a Hash.
# File lib/maruku/input/parse_doc.rb, line 72
# Expands the attribute list +al+ (pairs of key and value) into the Hash
# +result+.
#
# * :class  - values are accumulated, separated by single spaces.
# * :id     - the last value wins.
# * :ref    - if +self.ald+ has a definition under that name it is expanded
#             recursively; names already expanded are remembered in
#             result[:expanded_references] and a repeated name is reported
#             as a circular reference. Unknown names are accumulated in
#             result[:unresolved_references] and also set as a boolean
#             attribute (result[name.to_sym] = true).
# * others  - stored under key.to_sym.
#
# The accumulated strings are built without mutating the values held in
# +al+. The earlier implementation stored the first value by reference and
# then appended to it with <<, which modified the attribute list itself, so
# a list reused across elements (e.g. a default ALD) grew on every use.
def expand_attribute_list(al, result)
  al.each do |k, v|
    case k
    when :class
      if result[:class]
        # Build a new string; never append in place to a value owned by +al+.
        result[:class] = "#{result[:class]} #{v}"
      else
        result[:class] = v
      end
    when :id
      result[:id] = v
    when :ref
      if self.ald[v]
        already = (result[:expanded_references] ||= [])
        if !already.include?(v)
          already << v
          expand_attribute_list(self.ald[v], result)
        else
          # Record the repeated label so the reported chain shows the loop.
          already << v
          maruku_error "Circular reference between labels.\n\n" +
            "Label #{v.inspect} calls itself via recursion.\nThe recursion is " +
            already.map(&:inspect).join(' => ')
        end
      else
        if result[:unresolved_references]
          result[:unresolved_references] = "#{result[:unresolved_references]} #{v}"
        else
          result[:unresolved_references] = v
        end

        # $stderr.puts "Unresolved reference #{v.inspect} (avail: #{self.ald.keys.inspect})"
        result[v.to_sym] = true
      end
    else
      result[k.to_sym] = v
    end
  end
end
Search an attribute list looking for a colspan
# File lib/maruku/input/parse_block.rb, line 675
# Returns the first entry of the attribute list +al+ whose first element is
# the string "colspan", or nil when there is none.
def find_colspan(al)
  al.each { |entry| return entry if entry[0] == "colspan" }
  nil
end
Input is a LineSource
# File lib/maruku/input/parse_block.rb, line 24
# Block-level parser entry point: consumes +src+ line by line, dispatching
# on the +md_type+ of the current line, and returns the BlockContext holding
# the resulting elements.
#
# After the main loop the output is post-processed: merge_ial is applied,
# :ial elements and :empty markers are removed, and in lists / definition
# lists where no child reports +want_my_paragraph+ a leading paragraph of
# each item is replaced by that paragraph's children.
def parse_blocks(src)
  output = BlockContext.new

  # run state machine
  while src.cur_line
    # NOTE(review): presumably a matching block extension consumes the
    # line(s) it handles; otherwise this loop would not advance - confirm.
    next if check_block_extensions(src, output, src.cur_line)

    md_type = src.cur_line.md_type

    # Prints detected type (useful for debugging)
    #puts "parse_blocks #{md_type}|#{src.cur_line}"
    case md_type
    when :empty
      # Keep a marker in the output; markers are stripped after the loop.
      output << :empty
      src.ignore_line
    when :ial
      m = InlineAttributeList.match src.shift_line
      content = m[1] || ""
      src2 = CharSource.new(content, src)
      interpret_extension(src2, output)
    when :ald
      output << read_ald(src)
    when :text
      # paragraph, or table, or definition list
      read_text_material(src, output)
    when :header2, :hrule
      # hrule
      # (a :header2 line reaching this point is emitted as a rule as well)
      src.shift_line
      output << md_hrule
    when :header3
      output << read_header3(src)
    when :ulist, :olist
      list_type = (md_type == :ulist) ? :ul : :ol
      li = read_list_item(src)
      # append to current list if we have one
      if output.last.kind_of?(MDElement) &&
          output.last.node_type == list_type then
        output.last.children << li
      else
        output << md_el(list_type, li)
      end
    when :quote
      output << read_quote(src)
    when :code
      # read_code returns nil when only blank lines were collected
      e = read_code(src)
      output << e if e
    when :raw_html
      # More extra hacky stuff - if there's more than just HTML, we either wrap it
      # in a paragraph or break it up depending on whether it's an inline element or not
      e = read_raw_html(src)
      unless e.empty?
        if e.first.parsed_html &&
            (first_node_name = e.first.parsed_html.first_node_name) &&
            HTML_INLINE_ELEMS.include?(first_node_name) &&
            !%w(svg math).include?(first_node_name)
          # Inline root (other than svg/math): wrap the HTML and the children
          # of the second returned element in a single paragraph.
          content = [e.first]
          if e.size > 1
            content.concat(e[1].children)
          end
          output << md_par(content)
        else
          output.concat(e)
        end
      end
    when :footnote_text
      output << read_footnote_text(src)
    when :ref_definition
      # At the first line of a nested source (one that has a parent) the line
      # is read as text material instead of as a reference definition.
      if src.parent && src.cur_index == 0
        read_text_material(src, output)
      else
        read_ref_definition(src, output)
      end
    when :abbreviation
      output << read_abbreviation(src)
    when :xml_instr
      read_xml_instruction(src, output)
    else # unhandled line type at this level
      # Just treat it as raw text
      read_text_material(src, output)
    end
  end

  merge_ial(output, src, output)
  output.delete_if do |x|
    # Strip out IAL
    (x.kind_of?(MDElement) && x.node_type == :ial) ||
      # get rid of empty line markers
      x == :empty
  end

  # See for each list if we can omit the paragraphs
  # TODO: do this after
  output.each do |c|
    # Remove paragraphs that we can get rid of
    if [:ul, :ol].include?(c.node_type) && c.children.none?(&:want_my_paragraph)
      c.children.each do |d|
        if d.children.first && d.children.first.node_type == :paragraph
          d.children = d.children.first.children + d.children[1..-1]
        end
      end
    elsif c.node_type == :definition_list && c.children.none?(&:want_my_paragraph)
      c.children.each do |definition|
        definition.definitions.each do |dd|
          # NOTE(review): unlike the list branch above there is no nil guard
          # on dd.children.first; this assumes every definition has at least
          # one child - confirm.
          if dd.children.first.node_type == :paragraph
            dd.children = dd.children.first.children + dd.children[1..-1]
          end
        end
      end
    end
  end

  output
end
# File lib/maruku/input/parse_doc.rb, line 6
# Parses the complete document source +s+ into this document.
#
# Steps, in order: strip a leading UTF-8 byte order mark, split off the
# email-style headers into the document attributes, convert the body to
# UTF-8 when an +encoding+ attribute asks for it, parse the body as
# Markdown into @children, run the Markdown Extra passes when enabled,
# build the table of contents, default the title from the TOC header,
# expand attribute lists on every element, and finally execute embedded
# code blocks if the global :unsafe_features flag is set.
def parse_doc(s)
  # Remove BOM if it is present.
  # Anchored with \A so only a BOM at the very start of the input is
  # removed; ^ would also match at the start of any later line.
  s = s.sub(/\A\xEF\xBB\xBF/u, '')

  meta2 = parse_email_headers(s)
  data = meta2.delete :data
  self.attributes.merge! meta2

  # Attribute: encoding
  # Scope: document
  # Summary: Encoding for the document.
  #
  # If the `encoding` attribute is specified, then the content
  # will be converted from the specified encoding to UTF-8.
  enc = self.attributes.delete(:encoding) || 'utf-8'
  if enc.downcase != 'utf-8'
    # Switch to ruby 1.9 String#encode
    # with backward 1.8 compatibility
    if data.respond_to?(:encode!)
      data.encode!('UTF-8', enc)
    else
      # Loaded lazily on purpose: only needed when String#encode! is missing.
      require 'iconv'
      data = Iconv.new('utf-8', enc).iconv(data)
    end
  end

  @children = parse_text_as_markdown(data)

  if markdown_extra?
    self.search_abbreviations
    self.substitute_markdown_inside_raw_html
  end

  self.toc = create_toc

  # use title if not set
  self.attributes[:title] ||= toc.header_element.children.join if toc.header_element

  # Now do the attributes magic
  each_element do |e|
    # default attribute list
    if default = self.ald[e.node_type.to_s]
      expand_attribute_list(default, e.attributes)
    end
    expand_attribute_list(e.al, e.attributes)
    # puts "#{e.node_type}: #{e.attributes.inspect}"
  end

  # Attribute: unsafe_features
  # Scope: global
  # Summary: Enables execution of XML instructions.
  #
  # Disabled by default because of security concerns.
  #
  # Spelled MaRuKu::Globals to match read_xml_instruction.
  # NOTE(review): assumes the previous Maruku::Globals spelling resolved to
  # this same constant - confirm.
  if MaRuKu::Globals[:unsafe_features]
    self.execute_code_blocks
    # TODO: remove executed code blocks
  end
end
Splits the string into lines, wraps them in a LineSource and calls parse_blocks.
# File lib/maruku/input/parse_block.rb, line 17
# Parses +text+ as block-level Markdown: the text is split with
# split_lines, wrapped in a LineSource and handed to parse_blocks, whose
# result is returned.
def parse_text_as_markdown(text)
  parse_blocks(LineSource.new(split_lines(text)))
end
If there are non-inline HTML tags in the paragraph, break them out into their own elements and make paragraphs out of everything else.
# File lib/maruku/input/parse_block.rb, line 294
# Splits the span-level +children+ of a paragraph into a list of blocks:
# every child for which element_is_non_inline_html? holds becomes a block
# of its own, and each run of other children between them is wrapped with
# md_par.
#
# When a run ends because a non-inline element follows and its last child
# is a non-empty String, that string loses its final character (in place).
def pick_apart_non_inline_html(children)
  blocks = []
  pending = []

  children.each do |node|
    unless element_is_non_inline_html?(node)
      pending << node
      next
    end

    if pending.size > 0
      # Fix up paragraphs before non-inline elements having an extra space
      tail = pending.last
      tail.replace(tail[0...-1]) if tail.is_a?(String) && !tail.empty?
      blocks << md_par(pending)
      pending = []
    end
    blocks << node
  end

  blocks << md_par(pending) if pending.size > 0
  blocks
end
# File lib/maruku/input/parse_block.rb, line 361
# Consumes one line from +src+, matches it against Abbreviation, stores the
# description under the abbreviation in +self.abbreviations+ and returns
# the element built by md_abbr_def.
#
# A line that does not match, or a missing/empty abbreviation, is reported
# through maruku_error; processing then continues with whatever was
# captured.
def read_abbreviation(src)
  l = src.shift_line
  matched = l =~ Abbreviation
  abbr = $1
  desc = $2

  maruku_error "Bug: it's Andrea's fault. Tell him.\n#{l.inspect}" unless matched
  maruku_error "Bad abbrev. abbr=#{abbr.inspect} desc=#{desc.inspect}" if abbr.nil? || abbr.empty?

  self.abbreviations[abbr] = desc
  md_abbr_def(abbr, desc)
end
# File lib/maruku/input/parse_block.rb, line 158
# Consumes one attribute-definition-list line from +src+.
#
# On a match against AttributeDefinitionList, the attribute list is parsed
# from the second capture, registered in +self.ald+ under the id from the
# first capture, and the md_ald element is returned. A non-matching line
# is reported via maruku_error and nil is returned.
def read_ald(src)
  l = src.shift_line
  unless l =~ AttributeDefinitionList
    maruku_error "Bug Bug:\n#{l.inspect}"
    return nil
  end

  id, spec = $1, $2
  al = read_attribute_list(CharSource.new(spec, src))
  self.ald[id] = al
  md_ald(id, al)
end
# File lib/maruku/input/parse_block.rb, line 490
# Reads an indented code block: consumes consecutive :code and :empty
# lines from +src+, removes four columns of indentation from each with
# strip_indent, drops blank lines at both ends and returns
# md_codeblock of the lines joined with newlines.
#
# Returns nil when nothing but blank lines was collected.
def read_code(src)
  lines = []
  while (cur = src.cur_line) && [:code, :empty].include?(cur.md_type)
    lines << strip_indent(src.shift_line, 4)
  end

  # trim blank lines from the end, then from the start
  lines.pop while lines.last && lines.last.strip.size == 0
  lines.shift while lines.first && lines.first.strip.size == 0

  return nil if lines.empty?

  md_codeblock(lines.join("\n"))
end
# File lib/maruku/input/parse_block.rb, line 685
# Reads one definition-list entry from +src+: one or more :text lines as
# terms, at most one empty line, then one or more :definition lines, each
# followed by its indented continuation content.
#
# Returns a :definition element whose children are the terms followed by
# the definitions, with :terms, :definitions and :want_my_paragraph
# attributes. +want_my_paragraph+ is true when a blank line separated the
# terms from the definitions or when read_indented_content reports it for
# any definition.
#
# Raises RuntimeError ("Chunky Bacon!") when the input ends after the terms
# or when no :definition line follows them.
def read_definition(src)
  # Read one or more terms
  terms = []
  while (cur = src.cur_line) && cur.md_type == :text
    terms << md_el(:definition_term, parse_span(src.shift_line))
  end

  raise "Chunky Bacon!" unless src.cur_line

  # one optional empty
  want_my_paragraph = (src.cur_line.md_type == :empty)
  src.shift_line if want_my_paragraph

  raise "Chunky Bacon!" unless src.cur_line.md_type == :definition

  # Read one or more definitions
  definitions = []
  while src.cur_line && src.cur_line.md_type == :definition
    parent_offset = src.cur_index

    first = src.shift_line
    first =~ Definition
    first = $1

    lines, w_m_p = read_indented_content(src, 4, :definition, :definition)
    want_my_paragraph ||= w_m_p

    lines.unshift first

    nested = LineSource.new(lines, src, parent_offset)
    definitions << md_el(:definition_data, parse_blocks(nested))
  end

  md_el(:definition, terms + definitions, {
    :terms => terms,
    :definitions => definitions,
    :want_my_paragraph => want_my_paragraph
  })
end
# File lib/maruku/input/parse_block.rb, line 378
# Reads a footnote definition starting at the current line of +src+.
#
# The first line is matched against FootnoteText (id in the first capture,
# optional text in the second). Continuation lines are gathered with
# read_indented_content using an indentation of 4, the collected lines are
# parsed as blocks, and the resulting md_footnote element is stored in
# +self.footnotes+ under the id and returned.
def read_footnote_text(src)
  parent_offset = src.cur_index

  first = src.shift_line
  maruku_error "Bug (it's Andrea's fault)" unless first =~ FootnoteText

  id = $1
  text = $2 || ''

  stop_at = [:footnote_text, :ref_definition, :definition, :abbreviation]
  lines, = read_indented_content(src, 4, stop_at, :footnote_text)

  # add first line
  lines.unshift text unless text.strip.empty?

  children = parse_blocks(LineSource.new(lines, src, parent_offset))

  footnote = md_footnote(id, children)
  self.footnotes[id] = footnote
  footnote
end
Reads a header underlined with `-----` or `=====`.
# File lib/maruku/input/parse_block.rb, line 171
# Reads a two-line header: the text line followed by its underline.
#
# When new_meta_data? is on and the text ends in "{...}", the braces are
# parsed as an inline attribute list. If nothing but the braces was on the
# line, they are kept as the literal header text and no attributes are
# applied. The level is 2 when the underline is of type :header2,
# otherwise 1.
def read_header12(src)
  line = src.shift_line.strip
  al = nil

  # Check if there is an IAL
  if new_meta_data? && line =~ /^(.*?)\{(.*?)\}\s*$/
    line, ial = $1.strip, $2
    al = read_attribute_list(CharSource.new(ial, src))
  end

  text = parse_span(line)
  if text.empty?
    text = "{#{ial}}"
    al = nil
  end

  level = (src.cur_line.md_type == :header2) ? 2 : 1
  src.shift_line # consume the underline
  md_header(level, text, al)
end
reads a header like '#### header ####'
# File lib/maruku/input/parse_block.rb, line 191
# Reads a one-line header of the form '#### header ####'.
#
# The level is the number of leading '#' characters. Leading and trailing
# runs of '#' are removed before the text is span-parsed. When
# new_meta_data? is on, a trailing "{...}" is parsed as an inline
# attribute list; if nothing but the braces remains as text, they are kept
# literally and no attributes are applied.
def read_header3(src)
  line = src.shift_line.strip
  al = nil

  # Check if there is an IAL
  if new_meta_data? && line =~ /^(.*?)\{(.*?)\}\s*$/
    line, ial = $1.strip, $2
    al = read_attribute_list(CharSource.new(ial, src))
  end

  hashes = line[/^#+/]
  level = hashes.size
  text = parse_span(line.gsub(/\A#+|#+\z/, ''))

  if text.empty?
    text = "{#{ial}}"
    al = nil
  end

  md_header(level, text, al)
end
This is the only ugly function in the code base. It is used to read list items, descriptions, footnote text
# File lib/maruku/input/parse_block.rb, line 412 def read_indented_content(src, indentation, break_list, item_type, ial_offset=0) lines = [] # collect all indented lines saw_empty = false saw_anything_after = false break_list = Array(break_list) len = indentation - ial_offset while src.cur_line num_leading_spaces = src.cur_line.number_of_leading_spaces break if num_leading_spaces < len && ![:text, :empty].include?(src.cur_line.md_type) line = strip_indent(src.cur_line, indentation) md_type = line.md_type if md_type == :empty saw_empty = true lines << line src.shift_line next end # Unquestioningly grab anything that's deeper-indented if md_type != :code && num_leading_spaces > len lines << line src.shift_line next end # after a white line if saw_empty # we expect things to be properly aligned break if num_leading_spaces < len saw_anything_after = true else break if break_list.include?(md_type) end lines << line src.shift_line # You are only required to indent the first line of # a child paragraph. if md_type == :text while src.cur_line && src.cur_line.md_type == :text lines << strip_indent(src.shift_line, indentation) end end end # TODO fix this want_my_paragraph = saw_anything_after || (saw_empty && src.cur_line && src.cur_line.md_type == item_type) # create a new context while lines.last && lines.last.md_type == :empty lines.pop end return lines, want_my_paragraph end
Reads one list item, either ordered or unordered.
# File lib/maruku/input/parse_block.rb, line 334
# Reads one list item (ordered or unordered) starting at the current line
# of +src+ and returns the md_li element for it.
#
# spaces_before_first_char supplies the content indentation of the first
# line and an optional inline attribute list. The following indented lines
# are collected with read_indented_content, the first line (tabs expanded
# to TAB_SIZE stops, marker columns cut off) is put in front, and the
# result is parsed as blocks.
def read_list_item(src)
  parent_offset = src.cur_index
  item_type = src.cur_line.md_type
  first = src.shift_line

  indentation, ial = spaces_before_first_char(first)
  if ial
    al = read_attribute_list(CharSource.new(ial, src))
    ial_offset = ial.length + 3
  else
    al = nil
    ial_offset = 0
  end

  lines, want_my_paragraph = read_indented_content(src, indentation, [], item_type, ial_offset)

  # in case there is a second line and this line starts a new list, format it.
  starts_nested_list = !lines.empty? &&
                       [:ulist, :olist].include?(MaRuKu::MDLine.new(lines.first).md_type)
  lines.unshift "" if starts_nested_list

  # add first line
  # Strip first '*', '-', '+' from first line
  first_changed = first.gsub(/([^\t]*)(\t)/) { $1 + " " * (TAB_SIZE - $1.length % TAB_SIZE) }
  stripped = first_changed[indentation, first_changed.size - 1]
  lines.unshift stripped

  children = parse_blocks(LineSource.new(lines, src, parent_offset))
  md_li(children, want_my_paragraph, al)
end
# File lib/maruku/input/parse_block.rb, line 271
# Reads a paragraph: takes the current line and keeps adding lines until
# one of the stop conditions below holds, span-parses the collected lines
# and returns the result of pick_apart_non_inline_html.
#
# The paragraph ends before a line that
# * is a quote, header3, empty, ref_definition, ial or xml_instr line;
# * is a list line that is the last line or is followed by a line of the
#   same list type (a lone list-looking line does not break);
# * is blank after stripping;
# * is followed by a :header1 / :header2 line;
# * matches a block extension.
def read_paragraph(src)
  lines = [src.shift_line]
  hard_breaks = [:quote, :header3, :empty, :ref_definition, :ial, :xml_instr]

  while src.cur_line
    t = src.cur_line.md_type
    break if hard_breaks.include?(t)

    if t == :olist || t == :ulist
      break if !src.next_line || src.next_line.md_type == t
    end

    break if src.cur_line.strip.empty?
    break if src.next_line && [:header1, :header2].include?(src.next_line.md_type)
    break if any_matching_block_extension?(src.cur_line)

    lines << src.shift_line
  end

  pick_apart_non_inline_html(parse_span(lines, src))
end
# File lib/maruku/input/parse_block.rb, line 476
# Reads a block quote: consumes consecutive :quote lines from +src+, passes
# each through unquote, parses the resulting lines as blocks and returns
# them wrapped with md_quote.
def read_quote(src)
  parent_offset = src.cur_index

  # collect all indented lines
  lines = []
  lines << unquote(src.shift_line) while src.cur_line && src.cur_line.md_type == :quote

  md_quote(parse_blocks(LineSource.new(lines, src, parent_offset)))
end
# File lib/maruku/input/parse_block.rb, line 238
# Reads a block of raw HTML from +src+ by feeding lines to an HTMLHelper
# until it reports being finished or the input ends. Errors raised while
# feeding are reported through maruku_error.
#
# Returns an array of elements:
# * [html] when nothing but whitespace follows the HTML;
# * [html, paragraph] when the first tag is in HTML_INLINE_ELEMS - the
#   trailing text is span-parsed and wrapped with md_par;
# * [html, *blocks] otherwise - the trailing text is parsed as Markdown.
# If the trailing text starts with a space and its first parsed piece is a
# String, that piece gets the space put back in front.
def read_raw_html(src)
  h = HTMLHelper.new
  begin
    l = src.shift_line
    h.eat_this(l)
    # puts "\nBLOCK:\nhtml -> #{l.inspect}"
    while src.cur_line && !h.is_finished?
      l = src.shift_line
      # puts "html -> #{l.inspect}"
      h.eat_this "\n" + l
    end
  rescue => e
    maruku_error "Bad block-level HTML:\n#{e.inspect.gsub(/^/, '|')}\n", src
  end

  extra_line = (h.rest =~ /^\s*$/) ? nil : h.rest
  raw_html = h.stuff_you_read
  is_inline = HTML_INLINE_ELEMS.include?(h.first_tag)

  return [md_html(raw_html)] unless extra_line

  if is_inline
    remainder = parse_span(extra_line)
  else
    remainder = parse_text_as_markdown(extra_line)
  end

  if extra_line.start_with?(' ') && remainder[0].is_a?(String)
    remainder[0] = ' ' + remainder[0]
  end

  if is_inline
    [md_html(raw_html), md_par(remainder)]
  else
    [md_html(raw_html)] + remainder
  end
end
# File lib/maruku/input/parse_block.rb, line 513
# Reads a link reference definition from +src+, registers it in
# +self.refs+ under its sanitized id, and appends the md_ref_def element
# to +out+.
#
# If the next line is indented by 1-3 spaces and is not itself a footnote,
# reference, definition or abbreviation line, it is treated as the
# continuation of the definition and joined on with a space.
#
# Anything after the title (sixth capture of LinkRegex) is read as
# whitespace-separated key=value pairs and stored in the reference hash
# under symbol keys; a value wrapped in double quotes has them stripped.
#
# Returns nil without touching +out+ when the line does not match
# LinkRegex (after reporting it through maruku_error); otherwise returns
# +out+.
def read_ref_definition(src, out)
  line = src.shift_line

  # if link is incomplete, shift next line
  if src.cur_line &&
     ![:footnote_text, :ref_definition, :definition, :abbreviation].include?(src.cur_line.md_type) &&
     (1..3).include?(src.cur_line.number_of_leading_spaces)
    line << " " << src.shift_line
  end

  match = LinkRegex.match(line)
  unless match
    # Return explicitly: chaining with "and return" only returned when
    # maruku_error happened to give back a truthy value, and otherwise fell
    # through to match[1] on nil.
    maruku_error "Link does not respect format: '#{line}'"
    return
  end

  id = match[1]
  url = match[2]
  title = match[3] || match[4] || match[5]
  id = sanitize_ref_id(id)

  hash = self.refs[id] = { :url => url, :title => title }

  stuff = (match[6] || '')
  stuff.split.each do |couple|
    # Split on the first '=' only, so values that contain '=' stay intact
    # and a bare "=" no longer yields a nil key.
    k, v = couple.split('=', 2)
    v ||= ""
    v = v[1..-2] if v.start_with?('"') # strip quotes
    hash[k.to_sym] = v
  end

  out << md_ref_def(id, url, :title => title)
end
# File lib/maruku/input/parse_block.rb, line 563 def read_table(src) head = split_cells(src.shift_line).map do |s| md_el(:head_cell, parse_span(s)) end separator = split_cells(src.shift_line) align = separator.map do |s| # ex: :-------------------: # If the separator starts and ends with a colon, # center the cell. If it's on the right, right-align, # otherwise left-align. starts = s.start_with? ':' ends = s.end_with? ':' if s.empty? # blank nil elsif starts && ends :center elsif ends :right else :left end end align.pop if align[-1].nil? # trailing blank num_columns = align.size head.pop if head.size == num_columns + 1 && head[-1].al.size == 0 # trailing blank if head.size != num_columns maruku_error "Table head does not have #{num_columns} columns: \n#{head.inspect}" tell_user "I will ignore this table." # XXX try to recover return md_br end rows = [] while src.cur_line && src.cur_line.include?('|') row = [] colCount = 0 colspan = 1 currElem = nil currIdx = 0 split_cells(src.shift_line, true).map do |s| if s.empty? # empty cells increase the colspan of the previous cell found = false colspan += 1 al = (currElem &&currElem.al) || AttributeList.new if al.size > 0 elem = find_colspan(al) if elem != nil elem[1] = colspan.to_s found = true end end al.push(["colspan", colspan.to_s]) unless found # also handles the case of and empty attribute list else colspan = 1 row[currIdx] = md_el(:cell, parse_span(s)) currElem = row[currIdx] currIdx += 1 end end # # sanity check - make sure the current row has the right number of columns (including spans) # If not, dump the table and return a break # num_columns = count_columns(row) if num_columns == head.size + 1 && row[-1].al.size == 0 #trailing blank cell row.pop num_columns -= 1 end if head.size != num_columns maruku_error "Row does not have #{head.size} columns: \n#{row.inspect} - #{num_columns}" tell_user "I will ignore this table." 
# XXX need to recover return md_br end rows << row end rows.unshift(head) # put the header row on the processed table md_el(:table, rows, { :align => align }) end
# File lib/maruku/input/parse_block.rb, line 138
# Decides what a run of text starting at the current line of +src+ is and
# appends the parsed result to +output+. Checked in this order:
#
# 1. a table - the current line contains '|' and the next line, right
#    stripped, matches TableSeparator;
# 2. an underlined header - the next line is of type :header1 / :header2;
# 3. a definition-list entry - eventually_comes_a_def_list holds; the entry
#    is added to the preceding :definition_list element if there is one,
#    otherwise a new list is started;
# 4. a paragraph.
def read_text_material(src, output)
  upcoming = src.next_line

  if src.cur_line.include?('|') && # if contains a pipe, it could be a table header
     upcoming && upcoming.rstrip =~ TableSeparator
    output << read_table(src)
  elsif upcoming && [:header1, :header2].include?(upcoming.md_type)
    output << read_header12(src)
  elsif eventually_comes_a_def_list(src)
    definition = read_definition(src)
    previous = output.last
    if previous.kind_of?(MDElement) && previous.node_type == :definition_list
      previous.children << definition
    else
      output << md_el(:definition_list, definition)
    end
  else
    # Start of a paragraph
    output.concat read_paragraph(src)
  end
end
# File lib/maruku/input/parse_block.rb, line 209
# Reads an XML processing instruction ("<?target code ?>"), which may span
# several lines, from +src+ and appends the outcome to +output+.
#
# Normally an md_xml_instr element with the target and the code (closing
# "?>" removed) is appended. When the target is 'mrk' and
# MaRuKu::Globals[:unsafe_features] is set, the code is executed instead
# via safe_execute_code and a non-nil result is pushed (splatted) onto
# +output+.
#
# Text after the closing "?>" is reported through maruku_error. An
# instruction that is never closed before the input ends is reported the
# same way and then handled with whatever code was collected, instead of
# reading past the last line.
#
# Raises RuntimeError "BugBug" if the first line is not an instruction
# start, and "Not expected" if executed code returns a String.
def read_xml_instruction(src, output)
  m = /^\s*<\?((\w+)\s*)?(.*)$/.match src.shift_line
  raise "BugBug" unless m
  target = m[2] || ''
  code = m[3]

  # Pull in following lines until the terminator shows up; stop at end of
  # input rather than shifting beyond it.
  until code.include?('?>') || !src.cur_line
    code << "\n" << src.shift_line
  end

  if !code.include?('?>')
    maruku_error "Unterminated XML instruction (missing \"?>\"):\n" +
      code.gsub(/^/, '|'), src
  elsif code !~ /\?>\s*$/
    garbage = (/\?>(.*)$/.match(code))[1]
    maruku_error "Trailing garbage on last line: #{garbage.inspect}:\n" +
      code.gsub(/^/, '|'), src
  end
  code.gsub!(/\?>\s*$/, '')

  if target == 'mrk' && MaRuKu::Globals[:unsafe_features]
    # SECURITY: this evaluates code taken from the document itself; it is
    # reachable only when :unsafe_features has been switched on.
    result = safe_execute_code(self, code)
    if result
      if result.kind_of? String
        raise "Not expected"
      else
        output.push(*result)
      end
    end
  else
    output << md_xml_instr(target, code)
  end
end
# File lib/maruku/input/parse_doc.rb, line 111
# Evaluates +code+ in the context of +object+ (instance_eval) and returns
# the value of the evaluation.
#
# StandardError and ScriptError (which includes SyntaxError) are caught,
# reported through maruku_error together with the code and the backtrace,
# and nil is returned.
#
# SECURITY: this runs arbitrary Ruby supplied by the caller; the callers in
# this file only reach it behind the :unsafe_features setting.
def safe_execute_code(object, code)
  begin
    object.instance_eval(code)
  rescue StandardError, ScriptError => e
    # Exception exposes its stack as #backtrace. Kernel#caller is private,
    # so calling e.caller raised NoMethodError inside this handler.
    trace = (e.backtrace || []).join("\n")
    maruku_error "Exception while executing this:\n" +
      code.gsub(/^/, ">") +
      "\nThe error was:\n" +
      (e.inspect + "\n" + trace).gsub(/^/, "|")
    nil
  end
end
# File lib/maruku/input/parse_doc.rb, line 134
# For every registered abbreviation, rewrites each string of the document
# (via replace_each_string) into an array of pieces: the plain text between
# occurrences, and an md_abbr element for each occurrence of the
# abbreviation. Abbreviation and title are dup'ed for every element.
def search_abbreviations
  abbreviations.each do |abbrev, title|
    pattern = Regexp.new(Regexp.escape(abbrev))
    replace_each_string do |s|
      # bug if many abbreviations are present (agorf)
      scanner = StringScanner.new(s)
      pieces = []
      until scanner.eos?
        plain = ''
        # advance one character at a time until the abbreviation matches at
        # the current position or the input is exhausted
        plain << scanner.getch until scanner.scan(pattern) || scanner.eos?
        pieces << plain unless plain.empty?
        pieces << md_abbr(abbrev.dup, title ? title.dup : nil) if scanner.matched == abbrev
      end
      pieces
    end
  end
end
# File lib/maruku/input/parse_block.rb, line 549
# Splits a table line +s+ into its cells on '|'.
#
# Without +allowBlank+, empty pieces are dropped and the remaining cells are
# stripped (so a whitespace-only cell survives as an empty string).
#
# With +allowBlank+, blank cells are kept and nothing is stripped; only the
# outer pieces produced by a leading and/or trailing '|' are removed.
def split_cells(s, allowBlank = false)
  return s.split('|').reject(&:empty?).map(&:strip) unless allowBlank

  pieces = s.split('|', -1)
  if /^[|].*[|]$/ =~ s # handle the simple and decorated table cases
    pieces[1..-2] # allow blank cells, but only keep the inner elements of the cells
  elsif /^.*[|]$/ =~ s
    pieces[0..-2] # allow blank cells, but only keep the inner elements of the cells
  else
    pieces
  end
end
(PHP Markdown extra) Search for elements that have markdown=1 or markdown=block defined
# File lib/maruku/input/parse_doc.rb, line 154
# Visits every :raw_html element and, when it has parsed HTML, asks that
# HTML to process the Markdown inside its elements, passing this document
# along.
def substitute_markdown_inside_raw_html
  each_element(:raw_html) do |e|
    html = e.parsed_html
    html.process_markdown_inside_elements(self) if html
  end
end