Attempt to detect the encoding of this string
NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
str - a String, what you want to detect the encoding of
hint_enc - an optional String (like "UTF-8"), the encoding name which will be used as an additional hint to the charset detector
Returns: a Hash with :encoding, :language, :type and :confidence
# File lib/charlock_holmes/encoding_detector.rb, line 14
#
# Class-level convenience wrapper around the instance-level #detect.
#
# A fresh instance is built with `new` on every call, and both arguments
# are forwarded to it unchanged.
#
# str      - the value passed through as the first argument to #detect
# hint_enc - the value passed through as the second argument to #detect
#            (defaults to nil)
#
# Returns whatever the instance-level #detect returns.
def self.detect(str, hint_enc = nil)
  detector = new
  detector.detect(str, hint_enc)
end
Attempt to detect the encoding of this string, and return a list with all the possible encodings that match it.
NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
str - a String, what you want to detect the encoding of
hint_enc - an optional String (like "UTF-8"), the encoding name which will be used as an additional hint to the charset detector
Returns: an Array with zero or more Hashes, each one of them with :encoding, :language, :type and :confidence
# File lib/charlock_holmes/encoding_detector.rb, line 29
#
# Class-level convenience wrapper around the instance-level #detect_all.
#
# A fresh instance is built with `new` on every call, and both arguments
# are forwarded to it unchanged.
#
# str      - the value passed through as the first argument to #detect_all
# hint_enc - the value passed through as the second argument to #detect_all
#            (defaults to nil)
#
# Returns whatever the instance-level #detect_all returns.
def self.detect_all(str, hint_enc = nil)
  detector = new
  detector.detect_all(str, hint_enc)
end
The list of detectable encodings supported by this library
Returns: an Array of Strings
/*
 * The list of detectable encodings supported by this library.
 *
 * The list is built on the first call by walking the enumeration returned by
 * ucsdet_getAllDetectableCharsets(), and is then cached on `klass` under the
 * "encoding_list" instance variable. Later calls return the cached Array.
 *
 * klass - the receiver on which the cached list is stored
 *
 * Returns: an Array of Strings
 * Raises:  RuntimeError if ICU reports a failure while opening the detector
 *          or enumerating the charsets. Nothing is cached in that case, so a
 *          later call retries instead of returning a stale empty list.
 */
static VALUE rb_get_supported_encodings(VALUE klass)
{
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector *csd;
    UEnumeration *encoding_list;
    VALUE rb_encoding_list;
    const char *enc_name;
    int32_t enc_name_len;

    rb_encoding_list = rb_iv_get(klass, "encoding_list");

    // an earlier call already populated the list
    if (!NIL_P(rb_encoding_list)) {
        return rb_encoding_list;
    }

    csd = ucsdet_open(&status);
    encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
    rb_encoding_list = rb_ary_new();

    // Stop on the first NULL name or failed status so that a NULL pointer is
    // never handed to charlock_new_str().
    for (;;) {
        enc_name = uenum_next(encoding_list, &enc_name_len, &status);
        if (!enc_name || U_FAILURE(status)) {
            break;
        }
        rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
    }

    // Release both ICU handles before any raise: rb_raise does not return,
    // so cleanup placed after it would never run.
    if (encoding_list) {
        uenum_close(encoding_list);
    }
    if (csd) {
        ucsdet_close(csd);
    }

    if (U_FAILURE(status)) {
        rb_raise(rb_eRuntimeError, "failed to list detectable encodings: %s", u_errorName(status));
    }

    rb_iv_set(klass, "encoding_list", rb_encoding_list);

    return rb_encoding_list;
}
Attempt to detect the encoding of this string
str - a String, what you want to detect the encoding of
hint_enc - an optional String (like "UTF-8"), the encoding name which will be used as an additional hint to the charset detector
Returns: a Hash with :encoding, :language, :type and :confidence
/*
 * Attempt to detect the encoding of a string.
 *
 * Takes one required argument and one optional argument (scan format "11"):
 *
 * str      - a String; a non-String raises via Check_Type
 * hint_enc - optional String passed to the detector as the declared
 *            encoding; only type-checked when it is not nil
 *
 * Returns the value of rb_encdec_binarymatch() when detect_binary_content()
 * reports a hit; otherwise the value rb_encdec_buildmatch() produces for the
 * result of ucsdet_detect().
 */
static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
{
    VALUE text, hint;
    charlock_detector_t *detector;
    UErrorCode status = U_ZERO_ERROR;

    rb_scan_args(argc, argv, "11", &text, &hint);
    Check_Type(text, T_STRING);

    Data_Get_Struct(self, charlock_detector_t, detector);

    /* Binary-looking content short-circuits before any charset detection;
     * note that the hint is not examined at all on this path. */
    if (detect_binary_content(detector, text)) {
        return rb_encdec_binarymatch();
    }

    /* Not binary: hand the raw bytes to the charset detector. */
    ucsdet_setText(detector->csd,
                   RSTRING_PTR(text),
                   (int32_t)RSTRING_LEN(text),
                   &status);

    /* An explicit hint is forwarded as the declared encoding. */
    if (!NIL_P(hint)) {
        Check_Type(hint, T_STRING);
        ucsdet_setDeclaredEncoding(detector->csd,
                                   RSTRING_PTR(hint),
                                   (int32_t)RSTRING_LEN(hint),
                                   &status);
    }

    return rb_encdec_buildmatch(ucsdet_detect(detector->csd, &status));
}
Attempt to detect the encoding of this string, and return a list with all the possible encodings that match it.
str - a String, what you want to detect the encoding of
hint_enc - an optional String (like "UTF-8"), the encoding name which will be used as an additional hint to the charset detector
Returns: an Array with zero or more Hashes,
each one of them with :encoding, :language, :type and :confidence
/*
 * Attempt to detect the encoding of a string, returning every candidate match.
 *
 * Takes one required argument and one optional argument (scan format "11"):
 *
 * str      - a String; a non-String raises via Check_Type
 * hint_enc - optional String passed to the detector as the declared
 *            encoding; only type-checked when it is not nil
 *
 * Returns: an Array holding one rb_encdec_buildmatch() result per match
 * reported by ucsdet_detectAll(), in the order reported. When
 * detect_binary_content() reports a hit, the rb_encdec_binarymatch() value is
 * placed at the front of that Array. The Array is empty when the detector
 * yields no matches and the content is not flagged as binary.
 */
static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
{
    UErrorCode status = U_ZERO_ERROR;
    charlock_detector_t *detector;
    const UCharsetMatch **csm;
    VALUE rb_ret;
    // Zero-initialised and int32_t (the type ucsdet_detectAll writes through)
    // so the loop bound is always defined, even if the call fails without
    // storing a count.
    int32_t match_count = 0;
    int32_t i;
    VALUE rb_str;
    VALUE rb_enc_hint;
    VALUE binary_match;

    rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);

    Check_Type(rb_str, T_STRING);
    Data_Get_Struct(self, charlock_detector_t, detector);

    rb_ret = rb_ary_new();

    // first lets see if this is binary content
    binary_match = Qnil;
    if (detect_binary_content(detector, rb_str)) {
        binary_match = rb_encdec_binarymatch();
    }

    // unlike the single-match variant, charset detection still runs on
    // binary-looking content; the binary match is only prepended afterwards
    ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);

    if (!NIL_P(rb_enc_hint)) {
        Check_Type(rb_enc_hint, T_STRING);
        ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), (int32_t)RSTRING_LEN(rb_enc_hint), &status);
    }

    csm = ucsdet_detectAll(detector->csd, &match_count, &status);

    // a NULL result means there is no match array to walk
    if (csm) {
        for (i = 0; i < match_count; ++i) {
            rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
        }
    }

    if (!NIL_P(binary_match)) {
        rb_ary_unshift(rb_ret, binary_match);
    }

    return rb_ret;
}
Generated with the Darkfish Rdoc Generator 2.