很早之前实现的,不知道有没有更好的、支持同义词映射的词频统计方案。
# Load the tag list from the YAML synonym table; it is handed to #classify as-is.
tags = YAML.load_file('./tag_synonyms.yml')
text = "this is ruby run ubuntu is github ruby, ror is rails rails4 rails5 ror3"
# Count how often each tag occurs in the text; the returned Hash is shown after "# =>" below.
TagClassifier.new(text).classify(tags)
# =>
{
"ruby on rails" => 5,
"ruby" => 2,
"github" => 1,
"ubuntu" => 1
}
同义词映射表 tag_synonyms.yml
---
- ubuntu
- centos
- redhat
- ruby on rails:
  - ror
  - rails
- mysql
- redis
- ruby
- java
- python
- foreman
- pry
- apache
- nginx
- node.js:
  - node-js
  - nodejs
- openssh
- debian
- docker
- github
- amazon
过滤器 tag_classifier.rb
# -*- encoding: utf-8 -*-
# Counts how often each tag -- or any of its synonyms -- occurs in a text.
#
# A tag is either a plain value (matched through its +to_s+) or a Hash that
# maps the canonical tag to a list of synonyms, for example
# <tt>{ "ruby on rails" => ["ror", "rails"] }</tt>.
#
# Matching is case-insensitive and substring based: "rails4" counts as a hit
# for "rails". Tag text is always matched literally, so regexp metacharacters
# in a tag ("node.js", "c++") carry no special meaning.
class TagClassifier
  # Texts shorter than this many characters are never classified.
  MIN_TEXT_LENGTH = 3

  # @param text [#to_s] the text to search; +nil+ is treated as an empty text
  def initialize(text)
    # Normalise once so the length guard and the scan operate on the same String.
    @text = text.to_s
    @words = {}
  end

  # Counts the occurrences of every tag in the text.
  #
  # Counts are stored on the instance, so repeated calls on the same object
  # write into the same table.
  #
  # @param tags [Enumerable] plain tags and/or Hash tags (canonical => synonyms)
  # @param name [Object, nil] when given, every matching tag is recorded under
  #   this single key instead of its own label; the count of the last matching
  #   tag overwrites earlier ones
  # @return [Hash] label => count, ordered by descending count (the order of
  #   labels with equal counts is not guaranteed). When nothing matched, or the
  #   text is shorter than MIN_TEXT_LENGTH, the (empty) internal table is
  #   returned.
  def classify(tags, name = nil)
    return @words if @text.size < MIN_TEXT_LENGTH

    tags.each do |tag|
      label, terms = expand(tag)
      count = scan(terms)
      @words[name || label] = count if count > 0
    end
    return @words if @words.empty?

    Hash[@words.sort_by { |_label, rank| rank }.reverse]
  end
  alias split classify

  private

  # Splits a tag into the label it is reported under and the terms to search for.
  # A Hash tag is flattened to [canonical, synonym, ...] and labelled by its
  # first element; any other tag is its own label and only term.
  def expand(tag)
    return [tag, tag] unless tag.is_a?(Hash)

    terms = tag.flatten(2)
    [terms.first, terms]
  end

  # Returns the number of case-insensitive, literal occurrences of +term+
  # (a single value or an Array of alternatives) in the text.
  def scan(term)
    # Drop nil/blank alternatives: an empty alternative would match at every
    # position of the text and inflate the count.
    literals = Array(term).flatten.compact.map(&:to_s).reject(&:empty?)
    return 0 if literals.empty?

    # Escape each alternative so characters such as "." or "+" match themselves.
    source = literals.map { |literal| Regexp.escape(literal) }.join('|')
    @text.scan(Regexp.new(source, Regexp::IGNORECASE)).count
  end
end