Parent

Files

Class/Module Index [+]

Quicksearch

CharlockHolmes::EncodingDetector

Public Class Methods

detect(str, hint_enc=nil) click to toggle source

Attempt to detect the encoding of this string

NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call

str - a String, what you want to detect the encoding of

hint_enc - an optional String (like "UTF-8"), the encoding name which will be used as an additional hint to the charset detector

Returns: a Hash with :encoding, :language, :type and :confidence

# File lib/charlock_holmes/encoding_detector.rb, line 14
# Convenience wrapper: builds a throwaway detector and asks it for the
# single detection result for +str+.
#
# str      - the String to inspect
# hint_enc - optional encoding name passed through as a detection hint
#
# Returns whatever the instance-level #detect returns.
def self.detect(str, hint_enc=nil)
  detector = new
  detector.detect(str, hint_enc)
end
detect_all(str, hint_enc=nil) click to toggle source

Attempt to detect the encoding of this string, and return a list with all the possible encodings that match it.

NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call

str - a String, what you want to detect the encoding of

hint_enc - an optional String (like "UTF-8"), the encoding name which will be used as an additional hint to the charset detector

Returns: an Array with zero or more Hashes, each one of them with :encoding, :language, :type and :confidence

# File lib/charlock_holmes/encoding_detector.rb, line 29
# Convenience wrapper: builds a throwaway detector and asks it for every
# candidate detection result for +str+.
#
# str      - the String to inspect
# hint_enc - optional encoding name passed through as a detection hint
#
# Returns whatever the instance-level #detect_all returns.
def self.detect_all(str, hint_enc=nil)
  detector = new
  detector.detect_all(str, hint_enc)
end
detectable_encodings = EncodingDetector.supported_encodings click to toggle source

The list of detectable encodings supported by this library

Returns: an Array of Strings

/*
 * Returns the names of the charsets ICU's charset detector can detect.
 *
 * The list is built on first use and stored on `klass` under the
 * "encoding_list" instance variable; later calls return the stored Array.
 *
 * klass - the receiver the list is cached on
 *
 * Returns: an Array of Strings. If ICU reports an error while the list is
 * being built, whatever was collected so far is returned but NOT cached, so
 * the next call tries again instead of being stuck with a bad list.
 */
static VALUE rb_get_supported_encodings(VALUE klass)
{
        UCharsetDetector *csd;
        UErrorCode status = U_ZERO_ERROR;
        UEnumeration *encoding_list;
        VALUE rb_encoding_list;
        int32_t enc_count;
        int32_t i;
        const char *enc_name;
        int32_t enc_name_len;

        rb_encoding_list = rb_iv_get(klass, "encoding_list");

        // already populated by an earlier call
        if (!NIL_P(rb_encoding_list)) {
                return rb_encoding_list;
        }

        // allocate the Ruby array before acquiring any ICU resources
        rb_encoding_list = rb_ary_new();

        csd = ucsdet_open(&status);
        encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
        enc_count = uenum_count(encoding_list, &status);

        if (U_SUCCESS(status)) {
                for (i = 0; i < enc_count; i++) {
                        enc_name = uenum_next(encoding_list, &enc_name_len, &status);

                        // stop on error or early end of the enumeration
                        // rather than wrapping a NULL name
                        if (U_FAILURE(status) || !enc_name) {
                                break;
                        }

                        rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
                }
        }

        // the enumeration is owned by the caller and must be closed explicitly
        uenum_close(encoding_list);

        if (csd) {
                ucsdet_close(csd);
        }

        // only remember a list that was built without errors
        if (U_SUCCESS(status)) {
                rb_iv_set(klass, "encoding_list", rb_encoding_list);
        }

        return rb_encoding_list;
}

Public Instance Methods

detection_hash = EncodingDetector.detect str[, hint_enc] click to toggle source

Attempt to detect the encoding of this string

str - a String, what you want to detect the encoding of

hint_enc - an optional String (like "UTF-8"), the encoding name which will be used as an additional hint to the charset detector

Returns: a Hash with :encoding, :language, :type and :confidence

/*
 * Detects the encoding of a Ruby String using this detector instance.
 *
 * Accepts one required argument (the String to inspect) and one optional
 * argument (an encoding-name String used as a hint); both are type-checked
 * with Check_Type, the hint only when it is not nil.
 *
 * Returns: the value of rb_encdec_binarymatch() when
 * detect_binary_content() reports binary data, otherwise the value of
 * rb_encdec_buildmatch() for ICU's best match.
 */
static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
{
        VALUE text;
        VALUE hint;
        charlock_detector_t *det;
        UErrorCode icu_status = U_ZERO_ERROR;

        // one mandatory argument, one optional (nil when omitted)
        rb_scan_args(argc, argv, "11", &text, &hint);

        Check_Type(text, T_STRING);
        Data_Get_Struct(self, charlock_detector_t, det);

        // binary content short-circuits the charset detection entirely
        if (detect_binary_content(det, text)) {
                return rb_encdec_binarymatch();
        }

        // not binary: hand the raw bytes to the ICU charset detector
        ucsdet_setText(det->csd, RSTRING_PTR(text), (int32_t)RSTRING_LEN(text), &icu_status);

        // the hint is optional; validate and forward it only when supplied
        if (!NIL_P(hint)) {
                Check_Type(hint, T_STRING);
                ucsdet_setDeclaredEncoding(det->csd, RSTRING_PTR(hint), RSTRING_LEN(hint), &icu_status);
        }

        return rb_encdec_buildmatch(ucsdet_detect(det->csd, &icu_status));
}
detection_hash_array = EncodingDetector.detect_all str[, hint_enc] click to toggle source

Attempt to detect the encoding of this string, and return a list with all the possible encodings that match it.

str - a String, what you want to detect the encoding of

hint_enc - an optional String (like "UTF-8"), the encoding name which will be used as an additional hint to the charset detector

Returns: an Array with zero or more Hashes, each one of them with :encoding, :language, :type and :confidence
/*
 * Detects every encoding ICU considers a possible match for a Ruby String.
 *
 * Accepts one required argument (the String to inspect) and one optional
 * argument (an encoding-name String used as a hint); both are type-checked
 * with Check_Type, the hint only when it is not nil.
 *
 * Returns: an Array holding one rb_encdec_buildmatch() value per ICU match.
 * When detect_binary_content() reports binary data, the value of
 * rb_encdec_binarymatch() is placed at the front of the Array. If ICU
 * reports an error or returns no match list, no ICU matches are added.
 */
static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
{
        UErrorCode status = U_ZERO_ERROR;
        charlock_detector_t *detector;
        const UCharsetMatch **csm;
        VALUE rb_ret;
        int32_t i;
        // ucsdet_detectAll may return early without writing the count,
        // so it must start at a defined value
        int32_t match_count = 0;
        VALUE rb_str;
        VALUE rb_enc_hint;
        VALUE binary_match;

        rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);

        Check_Type(rb_str, T_STRING);
        Data_Get_Struct(self, charlock_detector_t, detector);

        rb_ret = rb_ary_new();

        // first lets see if this is binary content
        binary_match = Qnil;
        if (detect_binary_content(detector, rb_str)) {
                binary_match = rb_encdec_binarymatch();
        }

        ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);

        if (!NIL_P(rb_enc_hint)) {
                Check_Type(rb_enc_hint, T_STRING);
                ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), (int32_t)RSTRING_LEN(rb_enc_hint), &status);
        }

        csm = ucsdet_detectAll(detector->csd, &match_count, &status);

        // only walk the match list when ICU actually produced one
        if (csm && U_SUCCESS(status)) {
                for (i = 0; i < match_count; ++i) {
                        rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
                }
        }

        // the binary match, when present, goes ahead of the charset matches
        if (!NIL_P(binary_match)) {
                rb_ary_unshift(rb_ret, binary_match);
        }

        return rb_ret;
}
EncodingDetector#strip_tags? click to toggle source

Returns whether or not the strip_tags flag is set on this detector

Returns: Boolean

/*
 * Reports whether ICU's input filter is enabled on this detector.
 *
 * Returns: Qtrue when ucsdet_isInputFilterEnabled() yields 1, Qfalse
 * otherwise.
 */
static VALUE rb_get_strip_tags(VALUE self)
{
        charlock_detector_t *det;

        Data_Get_Struct(self, charlock_detector_t, det);

        if (ucsdet_isInputFilterEnabled(det->csd) == 1) {
                return Qtrue;
        }

        return Qfalse;
}
Also aliased as: strip_tags?
EncodingDetector#strip_tags = true click to toggle source

Enable or disable the stripping of HTML/XML tags from the input before attempting any detection

Returns: Boolean, the value passed

/*
 * Enables or disables ICU's input filter on this detector.
 *
 * rb_val - the flag; interpreted with Ruby truthiness, so every value other
 *          than nil and false enables the filter
 *
 * Returns: rb_val, unchanged.
 */
static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
{
        charlock_detector_t *detector;
        UBool val;

        Data_Get_Struct(self, charlock_detector_t, detector);

        // RTEST rather than `== Qtrue`: a truthy non-boolean must not be
        // silently treated as "disable"
        val = RTEST(rb_val) ? 1 : 0;

        ucsdet_enableInputFilter(detector->csd, val);

        return rb_val;
}
strip_tags?() click to toggle source
Alias for: strip_tags

[Validate]

Generated with the Darkfish Rdoc Generator 2.