diff options
Diffstat (limited to 'vendor/github.com/gogs/chardet/detector.go')
-rw-r--r-- | vendor/github.com/gogs/chardet/detector.go | 136 |
1 files changed, 0 insertions, 136 deletions
diff --git a/vendor/github.com/gogs/chardet/detector.go b/vendor/github.com/gogs/chardet/detector.go
deleted file mode 100644
index e11c222e..00000000
--- a/vendor/github.com/gogs/chardet/detector.go
+++ /dev/null
@@ -1,136 +0,0 @@
-// Package chardet ports character set detection from ICU.
-package chardet
-
-import (
-	"errors"
-	"sort"
-)
-
-// Result contains all the information that charset detector gives.
-type Result struct {
-	// IANA name of the detected charset.
-	Charset string
-	// IANA name of the detected language. It may be empty for some charsets.
-	Language string
-	// Confidence of the Result. Scale from 1 to 100. The bigger, the more confident.
-	Confidence int
-}
-
-// Detector implements charset detection.
-type Detector struct {
-	recognizers []recognizer
-	stripTag    bool
-}
-
-// List of charset recognizers
-var recognizers = []recognizer{
-	newRecognizer_utf8(),
-	newRecognizer_utf16be(),
-	newRecognizer_utf16le(),
-	newRecognizer_utf32be(),
-	newRecognizer_utf32le(),
-	newRecognizer_8859_1_en(),
-	newRecognizer_8859_1_da(),
-	newRecognizer_8859_1_de(),
-	newRecognizer_8859_1_es(),
-	newRecognizer_8859_1_fr(),
-	newRecognizer_8859_1_it(),
-	newRecognizer_8859_1_nl(),
-	newRecognizer_8859_1_no(),
-	newRecognizer_8859_1_pt(),
-	newRecognizer_8859_1_sv(),
-	newRecognizer_8859_2_cs(),
-	newRecognizer_8859_2_hu(),
-	newRecognizer_8859_2_pl(),
-	newRecognizer_8859_2_ro(),
-	newRecognizer_8859_5_ru(),
-	newRecognizer_8859_6_ar(),
-	newRecognizer_8859_7_el(),
-	newRecognizer_8859_8_I_he(),
-	newRecognizer_8859_8_he(),
-	newRecognizer_windows_1251(),
-	newRecognizer_windows_1256(),
-	newRecognizer_KOI8_R(),
-	newRecognizer_8859_9_tr(),
-
-	newRecognizer_sjis(),
-	newRecognizer_gb_18030(),
-	newRecognizer_euc_jp(),
-	newRecognizer_euc_kr(),
-	newRecognizer_big5(),
-
-	newRecognizer_2022JP(),
-	newRecognizer_2022KR(),
-	newRecognizer_2022CN(),
-
-	newRecognizer_IBM424_he_rtl(),
-	newRecognizer_IBM424_he_ltr(),
-	newRecognizer_IBM420_ar_rtl(),
-	newRecognizer_IBM420_ar_ltr(),
-}
-
-// NewTextDetector creates a Detector for plain text.
-func NewTextDetector() *Detector {
-	return &Detector{recognizers, false}
-}
-
-// NewHtmlDetector creates a Detector for Html.
-func NewHtmlDetector() *Detector {
-	return &Detector{recognizers, true}
-}
-
-var (
-	NotDetectedError = errors.New("Charset not detected.")
-)
-
-// DetectBest returns the Result with highest Confidence.
-func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
-	var all []Result
-	if all, err = d.DetectAll(b); err == nil {
-		r = &all[0]
-	}
-	return
-}
-
-// DetectAll returns all Results which have non-zero Confidence. The Results are sorted by Confidence in descending order.
-func (d *Detector) DetectAll(b []byte) ([]Result, error) {
-	input := newRecognizerInput(b, d.stripTag)
-	outputChan := make(chan recognizerOutput)
-	for _, r := range d.recognizers {
-		go matchHelper(r, input, outputChan)
-	}
-	outputs := make([]recognizerOutput, 0, len(d.recognizers))
-	for i := 0; i < len(d.recognizers); i++ {
-		o := <-outputChan
-		if o.Confidence > 0 {
-			outputs = append(outputs, o)
-		}
-	}
-	if len(outputs) == 0 {
-		return nil, NotDetectedError
-	}
-
-	sort.Sort(recognizerOutputs(outputs))
-	dedupOutputs := make([]Result, 0, len(outputs))
-	foundCharsets := make(map[string]struct{}, len(outputs))
-	for _, o := range outputs {
-		if _, found := foundCharsets[o.Charset]; !found {
-			dedupOutputs = append(dedupOutputs, Result(o))
-			foundCharsets[o.Charset] = struct{}{}
-		}
-	}
-	if len(dedupOutputs) == 0 {
-		return nil, NotDetectedError
-	}
-	return dedupOutputs, nil
-}
-
-func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
-	outputChan <- r.Match(input)
-}
-
-type recognizerOutputs []recognizerOutput
-
-func (r recognizerOutputs) Len() int           { return len(r) }
-func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
-func (r recognizerOutputs) Swap(i, j int)      { r[i], r[j] = r[j], r[i] }