module Twitter::Extractor

A module for including Tweet parsing in a class. This module provides functions for the extraction and processing of usernames, lists, URLs, hashtags and cashtags.

Public Instance Methods

extract_cashtags(text) { |cashtag_text| ... } click to toggle source

Extracts a list of all cashtags included in the Tweet text. If the text is nil or contains no cashtags an empty array will be returned. The array returned will not include the leading $ character.

If a block is given then it will be called for each cashtag.

# File lib/twitter-text/extractor.rb, line 297
# Returns the +:cashtag+ value of every entity produced by
# extract_cashtags_with_indices for +text+, in order.
#
# When a block is supplied, each cashtag text is passed to it; the same
# array is returned either way.
def extract_cashtags(text, &block) # :yields: cashtag_text
  symbols = extract_cashtags_with_indices(text).map { |entity| entity[:cashtag] }
  return symbols unless block

  # Array#each returns its receiver, so the list is still the result.
  symbols.each(&block)
end
extract_cashtags_with_indices(text) { |cashtag_text, start, end| ... } click to toggle source

Extracts a list of all cashtags included in the Tweet text along with the indices where each cashtag occurred. If the text is nil or contains no cashtags an empty array will be returned. The cashtag text returned will not include the leading $ character.

If a block is given then it will be called for each cashtag.

# File lib/twitter-text/extractor.rb, line 309
# Scans +text+ with Twitter::Regex[:valid_cashtag] and returns one Hash per
# match, shaped as <tt>{:cashtag => String, :indices => [start, end]}</tt>.
#
# The cashtag text comes from capture group 3; the indices run from the
# start of group 2 to the end of group 3. +[]+ is returned when +text+ is
# nil or holds no "$" at all.
#
# When a block is given it is called with the cashtag text, start index and
# end index of every entry.
def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
  # Cheap pre-check so the full pattern only runs on candidate text.
  return [] unless text =~ /\$/

  found = []
  text.scan(Twitter::Regex[:valid_cashtag]) do |_before, _dollar, symbol|
    match = Regexp.last_match
    found << {
      :cashtag => symbol,
      :indices => [match.char_begin(2), match.char_end(3)]
    }
  end

  if block_given?
    found.each do |entry|
      yield entry[:cashtag], entry[:indices].first, entry[:indices].last
    end
  end

  found
end
extract_entities_with_indices(text, options = {}, &block) click to toggle source

Extracts all usernames, lists, hashtags, cashtags and URLs in the Tweet text along with the indices where each entity occurred. If the text is nil or contains no entity an empty array will be returned.

If a block is given then it will be called for each entity.

# File lib/twitter-text/extractor.rb, line 67
# Collects URL, hashtag, mention/list and cashtag entities for +text+ (in
# that order), then passes them through remove_overlapping_entities.
#
# +options+ is forwarded to extract_urls_with_indices only. Hashtags are
# extracted with <tt>:check_url_overlap => false</tt>.
#
# Returns +[]+ when nothing was extracted; otherwise returns the result of
# remove_overlapping_entities, after passing each entity to the block when
# one is given.
def extract_entities_with_indices(text, options = {}, &block)
  found = []
  found.concat(extract_urls_with_indices(text, options))
  found.concat(extract_hashtags_with_indices(text, :check_url_overlap => false))
  found.concat(extract_mentions_or_lists_with_indices(text))
  found.concat(extract_cashtags_with_indices(text))

  return [] if found.empty?

  found = remove_overlapping_entities(found)
  found.each(&block) if block
  found
end
extract_hashtags(text) { |hashtag_text| ... } click to toggle source

Extracts a list of all hashtags included in the Tweet text. If the text is nil or contains no hashtags an empty array will be returned. The array returned will not include the leading # character.

If a block is given then it will be called for each hashtag.

# File lib/twitter-text/extractor.rb, line 246
# Returns the +:hashtag+ value of every entity produced by
# extract_hashtags_with_indices for +text+, in order.
#
# When a block is supplied, each hashtag text is passed to it.
def extract_hashtags(text, &block) # :yields: hashtag_text
  names = extract_hashtags_with_indices(text).map { |entity| entity[:hashtag] }
  names.each(&block) unless block.nil?
  names
end
extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) { |hashtag_text, start, end| ... } click to toggle source

Extracts a list of all hashtags included in the Tweet text along with the indices where each hashtag occurred. If the text is nil or contains no hashtags an empty array will be returned. The hashtag text returned will not include the leading # character.

If a block is given then it will be called for each hashtag.

# File lib/twitter-text/extractor.rb, line 258
# Extracts every hashtag in +text+ together with its position.
#
# text    - the Tweet text. +[]+ is returned when it is nil or contains
#           neither an ASCII "#" nor a full-width number sign.
# options - Hash. When +:check_url_overlap+ is truthy (the default when no
#           options are passed), hashtags that overlap a URL found by
#           extract_urls_with_indices are dropped.
#
# Returns an Array of Hashes shaped as
# <tt>{:hashtag => String, :indices => [start, end]}</tt>. The hashtag text
# is capture group 3 of Twitter::Regex[:valid_hashtag]; the indices run from
# the start of group 2 (the hash sign) to the end of group 3.
#
# When a block is given it is called with the hashtag text, start index and
# end index of every entry.
def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
  # Cheap pre-check before running the full pattern. \uFF03 is FULLWIDTH
  # NUMBER SIGN, written as an escape so it cannot be folded into a second
  # ASCII "#" (which would make full-width-only text bail out here).
  # NOTE(review): assumes Twitter::Regex[:valid_hashtag] accepts the
  # full-width sign as well - confirm against its definition.
  return [] unless text =~ /[#\uFF03]/

  tags = []
  text.scan(Twitter::Regex[:valid_hashtag]) do |_before, _hash, hash_text|
    # Capture the match before the next =~ overwrites $~.
    match_data = $~
    # $' is the text right after this match; skip the candidate when that
    # text matches :end_hashtag_match.
    next if $' =~ Twitter::Regex[:end_hashtag_match]

    tags << {
      :hashtag => hash_text,
      :indices => [match_data.char_begin(2), match_data.char_end(3)]
    }
  end

  if options[:check_url_overlap]
    urls = extract_urls_with_indices(text)
    unless urls.empty?
      # Mix URLs in so overlap removal can discard hashtags inside them,
      # then drop the URL entities again.
      tags.concat(urls)
      tags = remove_overlapping_entities(tags)
      tags.reject! { |entity| !entity[:hashtag] }
    end
  end

  tags.each { |tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
  tags
end
extract_mentioned_screen_names(text) { |username| ... } click to toggle source

Extracts a list of all usernames mentioned in the Tweet text. If the text is nil or contains no username mentions an empty array will be returned.

If a block is given then it will be called for each username.

# File lib/twitter-text/extractor.rb, line 87
# Returns the +:screen_name+ value of every entity produced by
# extract_mentioned_screen_names_with_indices for +text+, in order.
#
# When a block is supplied, each username is passed to it.
def extract_mentioned_screen_names(text, &block) # :yields: username
  mentions = extract_mentioned_screen_names_with_indices(text)
  names = mentions.map { |mention| mention[:screen_name] }
  names.each(&block) if block
  names
end
extract_mentioned_screen_names_with_indices(text) { |username, start, end| ... } click to toggle source

Extracts a list of all usernames mentioned in the Tweet text along with the indices where each mention occurred. If the text is nil or contains no username mentions, an empty array will be returned.

If a block is given, then it will be called with each username, the start index, and the end index in the text.

# File lib/twitter-text/extractor.rb, line 100
# Returns the username mentions in +text+ as Hashes shaped as
# <tt>{:screen_name => String, :indices => [start, end]}</tt>.
#
# Entries are taken from what extract_mentions_or_lists_with_indices yields;
# any entry whose list slug is non-empty (a list mention) is skipped.
# +[]+ is returned when +text+ is nil or false.
#
# When a block is given it is called with the username, start index and end
# index of every kept mention.
def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
  return [] unless text

  mentions = []
  extract_mentions_or_lists_with_indices(text) do |name, slug, first, last|
    mentions << {:screen_name => name, :indices => [first, last]} if slug.empty?
  end

  mentions.each do |mention|
    yield mention[:screen_name], mention[:indices].first, mention[:indices].last
  end if block_given?

  mentions
end
extract_mentions_or_lists_with_indices(text) { |username, list_slug, start, end| ... } click to toggle source

Extracts a list of all usernames or lists mentioned in the Tweet text along with the indices where each mention occurred. If the text is nil or contains no username or list mentions, an empty array will be returned.

If a block is given, then it will be called with each username, list slug, the start index, and the end index in the text. The list_slug will be an empty string if this is a username mention.

# File lib/twitter-text/extractor.rb, line 129
# Extracts every username or list mention in +text+ with its position.
#
# text - the Tweet text. +[]+ is returned when it is nil or contains neither
#        an ASCII "@" nor a full-width commercial at.
#
# Returns an Array of Hashes shaped as
# <tt>{:screen_name => String, :list_slug => String, :indices => [start, end]}</tt>.
# +:list_slug+ is "" when the match has no list part.
#
# When a block is given it is called with the screen name, list slug, start
# index and end index of every entry.
def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
  # Cheap pre-check before running the full pattern. \uFF20 is FULLWIDTH
  # COMMERCIAL AT, written as an escape so it cannot be folded into a second
  # ASCII "@" (which would make full-width-only text bail out here).
  # NOTE(review): assumes Twitter::Regex[:valid_mention_or_list] accepts the
  # full-width sign as well - confirm against its definition.
  return [] unless text =~ /[@\uFF20]/

  possible_entries = []
  text.to_s.scan(Twitter::Regex[:valid_mention_or_list]) do |_before, _at, screen_name, list_slug|
    # Capture the match before the next =~ overwrites $~.
    match_data = $~
    # $' is the text right after this match; skip the candidate when that
    # text matches :end_mention_match.
    next if $' =~ Twitter::Regex[:end_mention_match]

    # Group 3 is the screen name; step back one character to include the
    # at-sign in the reported range.
    start_position = match_data.char_begin(3) - 1
    # End at the list slug (group 4) when present, else at the screen name.
    end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
    possible_entries << {
      :screen_name => screen_name,
      :list_slug => list_slug || "",
      :indices => [start_position, end_position]
    }
  end

  if block_given?
    possible_entries.each do |mention|
      yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
    end
  end

  possible_entries
end
extract_reply_screen_name(text) { |username| ... } click to toggle source

Extracts the username replied to in the Tweet text. If the text is nil or is not a reply, nil will be returned.

If a block is given then it will be called with the username replied to (if any)

# File lib/twitter-text/extractor.rb, line 160
# Returns the first capture of Twitter::Regex[:valid_reply] matched against
# +text+, or +nil+ when +text+ is nil/false, when the pattern does not
# match, or when the text right after the match matches
# Twitter::Regex[:end_mention_match].
#
# When a block is given and a username was found, the username is yielded
# before being returned.
def extract_reply_screen_name(text) # :yields: username
  return nil unless text

  reply = text.match(Twitter::Regex[:valid_reply])
  return nil if reply.nil?
  # Reject the candidate when the mention runs into something that matches
  # :end_mention_match.
  return nil if Twitter::Regex[:end_mention_match] =~ reply.post_match

  username = reply[1]
  yield username if block_given?
  username
end
extract_urls(text) { |url| ... } click to toggle source

Extracts a list of all URLs included in the Tweet text. If the text is nil or contains no URLs an empty array will be returned.

If a block is given then it will be called for each URL.

# File lib/twitter-text/extractor.rb, line 176
# Returns the +:url+ value of every entity produced by
# extract_urls_with_indices for +text+, in order.
#
# When a block is supplied, each URL is passed to it; the same array is
# returned either way.
def extract_urls(text, &block) # :yields: url
  found = extract_urls_with_indices(text).map { |entity| entity[:url] }
  return found unless block_given?

  # Array#each returns its receiver, so the list is still the result.
  found.each(&block)
end
extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) { |url, start, end| ... } click to toggle source

Extracts a list of all URLs included in the Tweet text along with the indices. If the text is nil or contains no URLs an empty array will be returned.

If a block is given then it will be called for each URL.

# File lib/twitter-text/extractor.rb, line 187
# Extracts every URL in +text+ together with its position.
#
# text    - the Tweet text. +[]+ is returned when it is nil/false, or when
#           it lacks a "." (protocol-less extraction enabled) or a ":"
#           (protocol-less extraction disabled).
# options - Hash. +:extract_url_without_protocol+ (true when no options are
#           passed) allows URLs that have no protocol to be extracted.
#
# Returns an Array of Hashes shaped as
# <tt>{:url => String, :indices => [start, end]}</tt>. When a block is given
# it is called with the url, start index and end index of every entry.
def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
  # Cheap pre-check: skip the scan when the text cannot contain a URL.
  return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
  urls = []

  text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
    # Keep the outer match; the nested scan and =~ below overwrite $~.
    valid_url_match_data = $~

    # Capture group 3 corresponds to the +url+ block parameter.
    start_position = valid_url_match_data.char_begin(3)
    end_position = valid_url_match_data.char_end(3)

    # If protocol is missing and domain contains non-ASCII characters,
    # extract ASCII-only domains.
    if !protocol
      next if !options[:extract_url_without_protocol] || before =~ Twitter::Regex[:invalid_url_without_protocol_preceding_chars]
      last_url = nil
      domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
        # Inside this block $~ is the match within +domain+, so its offsets
        # are shifted by start_position.
        # NOTE(review): this assumes +domain+ begins at start_position when
        # there is no protocol - confirm against Twitter::Regex[:valid_url].
        last_url = {
          :url => ascii_domain,
          :indices => [start_position + $~.char_begin(0),
                       start_position + $~.char_end(0)]
        }
        # Record the domain when the URL has a path, or the domain matches
        # the special-short-domain pattern, or it does not match the
        # invalid-short-domain pattern.
        if path ||
            ascii_domain =~ Twitter::Regex[:valid_special_short_domain] ||
            ascii_domain !~ Twitter::Regex[:invalid_short_domain]
          urls << last_url
        end
      end

      # no ASCII-only domain found. Skip the entire URL
      next unless last_url

      # last_url only contains domain. Need to add path and query if they exist.
      if path
        # With a path present, last_url was already appended to urls in the
        # loop above; it is the same Hash, so updating it in place extends
        # the recorded entry to cover the rest of the URL.
        last_url[:url] = url.sub(domain, last_url[:url])
        last_url[:indices][1] = end_position
      end
    else
      # In the case of t.co URLs, don't allow additional path characters
      if url =~ Twitter::Regex[:valid_tco_url]
        # Keep only the portion matched by :valid_tco_url and move the end
        # index to match the shortened URL.
        url = $&
        end_position = start_position + url.char_length
      end
      urls << {
        :url => url,
        :indices => [start_position, end_position]
      }
    end
  end
  urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
  urls
end
remove_overlapping_entities(entities) click to toggle source

Remove overlapping entities. This returns a new array with no overlapping entities.

# File lib/twitter-text/extractor.rb, line 51
# Returns a new array holding the entities of +entities+ ordered by start
# index, with every entity that begins before the end of the previously
# kept entity left out. The input array is not modified.
#
# Each entity must respond to <tt>[:indices]</tt> with a [start, end] pair.
def remove_overlapping_entities(entities)
  kept = []
  entities.sort_by { |entity| entity[:indices].first }.each do |candidate|
    previous = kept.last
    # An entity overlaps when it starts before the last kept entity ends.
    next if previous && previous[:indices].last > candidate[:indices].first

    kept << candidate
  end
  kept
end