本文概述
Soundex是一种语音算法, 用于按声音索引名称(英语发音), 可以将来自不同字符串的SOUNDEX代码进行比较, 以查看说话时字符串听起来的相似程度。
代码的第一个字符是表达式的第一个字符, 并转换为大写。代码的第二到第四个字符是代表表达式中字母的数字。字母A, E, I, O, U, H, W和Y会被忽略, 除非它们是字符串的第一个字母。所有A-Z范围以外的国际字母字符都被视为元音。因此, 发音几乎相同的两个字符串应该具有相同的Soundex代码。例如, 单词“text”和“tixt”都会得到Soundex代码“T230”。
让我们开始吧 !
C
#include <stdio.h>
/*
 * Soundex class of every 7-bit ASCII character, filled in by init():
 *   -1 = vowel (separates equal codes), 0 = ignored, 1..6 = Soundex digit.
 * Declared as signed char so that -1 survives on platforms where plain
 * char is unsigned.
 */
static signed char code[128] = { 0 };

/* Class of one input byte; bytes outside 7-bit ASCII are treated as vowels
 * instead of indexing past the end of the table. */
static int soundex_class(char ch)
{
    unsigned char u = (unsigned char)ch;
    return u < sizeof code ? code[u] : -1;
}

/*
 * Compute the 4-character Soundex code of s.
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call (not reentrant). A NULL or empty input yields the empty string.
 * The first character is copied through (ASCII letters are upper-cased),
 * followed by up to three digits, padded with '0'.
 */
const char* soundex(const char *s)
{
    static char out[5];
    int c, prev, i;

    out[0] = out[4] = 0;
    if (!s || !*s) return out;

    /* first letter, though not coded, can still affect next letter: Pfister */
    prev = soundex_class(*s);
    out[0] = *s++;
    if (out[0] >= 'a' && out[0] <= 'z') out[0] -= 'a' - 'A';

    for (i = 1; *s && i < 4; s++) {
        c = soundex_class(*s);
        if (c == prev) continue;        /* same class as predecessor: skip */
        if (c == -1) {
            prev = 0;                   /* vowel as separator */
        } else if (c > 0) {
            out[i++] = (char)(c + '0');
            prev = c;
        }
        /* c == 0: ignored character, prev is left untouched */
    }
    while (i < 4) out[i++] = '0';
    return out;
}
/*
 * Assign Soundex class c to every character of s, and to the character
 * obtained by flipping bit 0x20 (the other ASCII case of a letter).
 */
void add_code(const char *s, int c)
{
    const char *p;

    for (p = s; *p != '\0'; ++p) {
        int ch = (int)*p;
        code[ch ^ 0x20] = c;
        code[ch] = c;
    }
}
/*
 * Populate the class table. The position of each group in cls, minus one,
 * is the class passed to add_code: vowels get -1, the empty group 0,
 * and the remaining groups 1..6.
 */
void init()
{
    static const char *cls[] =
        { "AEIOU", "", "BFPV", "CGJKQSXZ", "DT", "L", "MN", "R", 0};
    const char **group;

    for (group = cls; *group; group++)
        add_code(*group, (int)(group - cls) - 1);
}
用法
/* Demo: build the class table, then print the code for "Javascript". */
int main(void)
{
    init();
    /* J126 */
    /* Print through "%s": the result must never be used as the format string. */
    printf("%s", soundex("Javascript"));
    return 0;
}
C#
using System.Text.RegularExpressions;
/// <summary>
/// Soundex encoder. Only needs <c>System.Text.RegularExpressions</c> to be
/// imported; everything else is referenced by its full name or avoided.
/// </summary>
public static class Soundex
{
    /// <summary>
    /// Returns the 4-character Soundex code of <paramref name="word"/>.
    /// </summary>
    /// <param name="word">Text to encode; may be null.</param>
    /// <returns>
    /// The code, or "0000" when nothing is left after removing characters
    /// that are neither word characters nor whitespace.
    /// </returns>
    public static string For(string word)
    {
        const int MaxSoundexCodeLength = 4;

        var soundexCode = new System.Text.StringBuilder();
        var previousWasHOrW = false;

        // Invariant upper-casing keeps the result independent of the current
        // culture (e.g. the Turkish dotted/dotless i).
        word = Regex.Replace(
            word == null ? string.Empty : word.ToUpperInvariant(), @"[^\w\s]", string.Empty);

        if (string.IsNullOrEmpty(word))
            return string.Empty.PadRight(MaxSoundexCodeLength, '0');

        soundexCode.Append(word[0]);

        for (var i = 1; i < word.Length; i++)
        {
            var numberCharForCurrentLetter =
                GetCharNumberForLetter(word[i]);

            // Second letter sharing the first letter's code is dropped.
            if (i == 1 &&
                numberCharForCurrentLetter ==
                GetCharNumberForLetter(soundexCode[0]))
                continue;

            // Same code on both sides of an H or W counts once.
            if (soundexCode.Length > 2 && previousWasHOrW &&
                numberCharForCurrentLetter ==
                soundexCode[soundexCode.Length - 2])
                continue;

            // Adjacent identical codes count once.
            if (soundexCode.Length > 0 &&
                numberCharForCurrentLetter ==
                soundexCode[soundexCode.Length - 1])
                continue;

            soundexCode.Append(numberCharForCurrentLetter);
            previousWasHOrW = word[i] == 'H' || word[i] == 'W';
        }

        // '0' placeholders only served as separators; strip them before padding.
        return soundexCode
            .Replace("0", string.Empty)
            .ToString()
            .PadRight(MaxSoundexCodeLength, '0')
            .Substring(0, MaxSoundexCodeLength);
    }

    /// <summary>
    /// Maps an upper-case letter to its Soundex digit; every other character
    /// yields '0'.
    /// </summary>
    private static char GetCharNumberForLetter(char letter)
    {
        if ("BFPV".IndexOf(letter) >= 0) return '1';
        if ("CGJKQSXZ".IndexOf(letter) >= 0) return '2';
        if ("DT".IndexOf(letter) >= 0) return '3';
        if ('L' == letter) return '4';
        if ("MN".IndexOf(letter) >= 0) return '5';
        if ('R' == letter) return '6';
        return '0';
    }
}
用法
Soundex.For("CSharp Language") == Soundex.For("CSherp Language"); // true: both inputs encode to C614
D
D标准库(Phobos)已包含soundex函数。
import std.stdio: writeln;
import std.string: soundex;
/// Checks the soundex function against a table of known name/code pairs.
void main() {
    // Each row is [input name, expected Soundex code].
    immutable string[2][] expectations = [
        ["soundex", "S532"],
        ["example", "E251"],
        ["ciondecks", "C532"],
        ["ekzampul", "E251"],
        ["Robert", "R163"],
        ["Rupert", "R163"],
        ["Rubin", "R150"],
        ["Ashcraft", "A261"],
        ["Ashcroft", "A261"],
        ["Tymczak", "T522"],
    ];
    foreach (row; expectations)
        assert(soundex(row[0]) == row[1]);
}
F#
/// Computes the 4-character American Soundex code of `x`.
///
/// The first character is kept (upper-cased) and followed by up to three
/// digits, padded with '0'. Rules applied:
///   * letters sharing a digit collapse to one digit when adjacent or when
///     separated only by H/W (runs of any length);
///   * vowels, Y and non-letters separate equal digits but emit nothing;
///   * the first letter's own digit suppresses an equal digit right after it.
/// An empty string yields "0000".
let americanSoundex (x : string) =
    // '1'..'6' = Soundex digit, '-' = H/W (transparent), '0' = separator.
    let codeOf ch =
        match ch with
        | 'B' | 'F' | 'P' | 'V' -> '1'
        | 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' -> '2'
        | 'D' | 'T' -> '3'
        | 'L' -> '4'
        | 'M' | 'N' -> '5'
        | 'R' -> '6'
        | 'H' | 'W' -> '-'
        | _ -> '0'
    match x.ToUpperInvariant() |> List.ofSeq with
    | [] -> "0000"
    | first :: rest ->
        // State: (code of the last significant letter, digits emitted so far, reversed).
        let step (previous, digitsRev) ch =
            match codeOf ch with
            | '-' -> (previous, digitsRev)                       // H/W never break a run
            | '0' -> ('0', digitsRev)                            // separator resets the run
            | code when code = previous -> (previous, digitsRev) // duplicate of the run
            | code -> (code, code :: digitsRev)
        let (_, digitsRev) = List.fold step (codeOf first, []) rest
        let digits = digitsRev |> List.rev |> List.truncate 3
        let code = System.String(Array.ofList (first :: digits))
        code.PadRight(4, '0')
// Print "<name> = <code>" for a handful of sample names.
for name in [ "Robert"; "Rupert"; "Robbert"; "Rubin"
              "Beer"; "Bear"; "Bearer"
              "Smith"; "Smyth"
              "Ashcraft"; "Ashcroft"
              "Tymczak"; "Pfister" ] do
    printfn "%-8s = %s" name (americanSoundex name)
(*
Expected output:
Robert   = R163
Rupert   = R163
Robbert  = R163
Rubin    = R150
Beer     = B600
Bear     = B600
Bearer   = B660
Smith    = S530
Smyth    = S530
Ashcraft = A261
Ashcroft = A261
Tymczak  = T522
Pfister  = P236
*)
Go
package myPackageName
import (
"bytes"
"strings"
"fmt"
)
// codeLen is the length of a Soundex code: one letter plus three digits.
const codeLen = 4

// codes maps each lower-case ASCII letter to its Soundex digit; letters
// that are never coded map to the empty string.
var codes = map[string]string{
	"a": "", "b": "1", "c": "2", "d": "3", "e": "", "f": "1", "g": "2",
	"h": "", "i": "", "j": "2", "k": "2", "l": "4", "m": "5", "n": "5",
	"o": "", "p": "1", "q": "2", "r": "6", "s": "2", "t": "3", "u": "",
	"v": "1", "w": "", "x": "2", "y": "", "z": "2",
}

// Soundex returns the upper-cased, codeLen-character Soundex code of s.
//
// The first byte of s is kept as the leading letter. Following letters are
// replaced by their digit; a digit is dropped when it equals the digit of
// the preceding coded letter (including the first letter), with H and W
// ignored entirely so they do not break such a run. Vowels and any other
// characters separate runs. The result is padded with '0'. An empty input
// returns the empty string instead of panicking.
func Soundex(s string) string {
	if s == "" {
		return ""
	}

	var encoded bytes.Buffer
	encoded.WriteByte(s[0])

	// Code of the most recent letter that is not H/W; seeded with the first
	// letter so that e.g. "Pfister" does not emit a digit for the F.
	previous := codes[strings.ToLower(string(s[0]))]

	for i := 1; i < len(s) && encoded.Len() < codeLen; i++ {
		current := strings.ToLower(string(s[i]))
		if current == "h" || current == "w" {
			continue
		}
		code := codes[current]
		if code != "" && code != previous {
			encoded.WriteByte(code[0])
		}
		previous = code
	}

	if missing := codeLen - encoded.Len(); missing > 0 {
		encoded.WriteString(strings.Repeat("0", missing))
	}
	return strings.ToUpper(encoded.String())
}
用法
// main prints the Soundex code of a sample word.
func main() {
	// Expected output: J126
	encoded := Soundex("Javascript")
	fmt.Println(encoded)
}
Java
/**
 * Returns the Soundex digit ("1" to "6") for an upper-case consonant,
 * or the empty string for any other character.
 */
private static String getCode(char c){
    if ("BFPV".indexOf(c) >= 0) return "1";
    if ("CGJKQSXZ".indexOf(c) >= 0) return "2";
    if ("DT".indexOf(c) >= 0) return "3";
    if (c == 'L') return "4";
    if ("MN".indexOf(c) >= 0) return "5";
    if (c == 'R') return "6";
    return "";
}
/**
 * Computes the 4-character Soundex code of {@code s}.
 *
 * The first character is kept (upper-cased) and followed by up to three
 * digits, padded with '0'. A digit is skipped when it equals the code of
 * the immediately preceding character, the first letter included, so that
 * e.g. "Pfister" does not emit a digit for the F. Characters without a
 * code (vowels, H, W, Y, non-letters) reset that comparison.
 *
 * @param s text to encode; may be null or empty
 * @return the code, or the empty string for null/empty input
 */
public static String soundex(String s){
    if (s == null || s.isEmpty()) {
        return "";
    }
    // Upper-case once, locale-independently, instead of on every iteration.
    String upper = s.toUpperCase(java.util.Locale.ROOT);
    StringBuilder code = new StringBuilder();
    code.append(upper.charAt(0));
    String previous = getCode(upper.charAt(0));
    for (int i = 1; i < upper.length() && code.length() < 4; i++) {
        String current = getCode(upper.charAt(i));
        if (current.length() > 0 && !current.equals(previous)) {
            code.append(current);
        }
        previous = current;
    }
    while (code.length() < 4) {
        code.append('0');
    }
    return code.toString();
}
用法
/** Prints the Soundex code of a few sample words, one per line. */
public static void main(String[] args){
    // Expected output, in order: S532, E251, S532, E251
    String[] samples = {"Soundex", "Example", "Sownteks", "Ekzampul"};
    for (String sample : samples) {
        System.out.println(soundex(sample));
    }
}
JavaScript
/**
 * Compute the 4-character Soundex code of a string.
 *
 * The first character is kept and followed by the digits of the remaining
 * letters; a digit is dropped when it equals the code of the character
 * directly before it (the first letter included). The result is padded
 * with '0', cut to four characters and upper-cased.
 *
 * @param {string} s Text to encode.
 * @returns {string} The code, or '' for an empty, null or undefined input
 *                   (previously an empty string produced "UNDE").
 */
var soundex = function(s) {
    var text = (s === null || s === undefined) ? '' : String(s);
    if (text.length === 0) {
        return '';
    }

    var codes = { a: '', e: '', i: '', o: '', u: '', b: 1, f: 1, p: 1, v: 1, c: 2, g: 2, j: 2, k: 2, q: 2, s: 2, x: 2, z: 2, d: 3, t: 3, l: 4, m: 5, n: 5, r: 6 };
    var letters = text.toLowerCase().split('');
    var first = letters.shift();

    // Code of the previous character; undefined for h, w, y and non-letters.
    var previous = codes[first];
    var digits = '';
    letters.forEach(function(letter) {
        var code = codes[letter];
        if (code !== previous && code !== undefined) {
            digits += code;
        }
        previous = code;
    });

    return (first + digits + '000').slice(0, 4).toUpperCase();
};
用法
soundex("Javascript") == soundex("Jabascript"); // true: both inputs encode to J126
Objective-C
你可以在Darkseed编写的github gist中找到Soundex算法Objective-C的实现。
PHP
PHP已经将soundex作为内置函数来计算字符串的soundex键。
用法
soundex("PHP Server Language") == soundex("PHP Serber language"); // true: both strings produce the same key (P261, not P100; the built-in skips non-letters)
python
函数
def get_soundex(name):
    """Return the 4-character Soundex code for ``name``.

    The first character is kept (upper-cased) and followed by up to three
    digits, padded with "0". A digit is skipped when it equals the code of
    the preceding letter, the first letter included, so "Schultz" and
    "Shultz" both give "S432". Vowels, H, W and Y separate equal digits;
    characters that are not ASCII letters are ignored.

    An empty ``name`` returns "" instead of raising IndexError.
    """
    if not name:
        return ""
    name = name.upper()
    groups = {"BFPV": "1", "CGJKQSXZ": "2", "DT": "3", "L": "4", "MN": "5", "R": "6", "AEIOUHWY": "."}
    codes = {letter: digit for letters, digit in groups.items() for letter in letters}

    # trail[0] is the first letter's own code; it is only there so that the
    # duplicate check also covers the first letter, and is cut off below.
    trail = codes.get(name[0], ".")
    for char in name[1:]:
        code = codes.get(char)
        if code is not None and code != trail[-1]:
            trail += code

    digits = trail[1:].replace(".", "")
    return (name[0] + digits)[:4].ljust(4, "0")
用法
# Print a small table of names and their Soundex codes. The list is called
# `names` rather than `list` so the builtin `list` type is not shadowed.
names = ["Smith", "Smythe", "Robert", "Rupert", "Schultz", "Shultz"]
print("NAME\t\tSOUNDEX")
for name in names:
    print("%s\t\t%s" % (name, get_soundex(name)))
库
如果你更喜欢使用现成的库, 则可以使用 Fuzzy 包(它通过 Pyrex 编写的 C 扩展来提高速度)。
Ruby
class String
  # Coded consonants and, position by position, their Soundex digits.
  SoundexChars = 'BFPVCGJKQSXZDTLMNR'
  SoundexNums  = '111122222222334556'
  # Kept for backward compatibility; no longer used by #soundex.
  SoundexCharsEx = '^' + SoundexChars
  SoundexCharsDel = '^A-Z'

  # desc: http://en.wikipedia.org/wiki/Soundex
  #
  # Returns the Soundex code of the string: the first letter followed by
  # digits, padded with '0' to at least three digits. With census = true
  # (the default) at most three digits are kept; with census = false all
  # digits are returned. Characters outside A-Z are dropped first; a string
  # without any letter yields ''.
  #
  # Equal digits collapse when adjacent or separated only by H/W, vowels
  # keep equal digits apart ("Bearer" => "B660"), and the first letter's
  # own digit suppresses an equal digit right after it ("Pfister" => "P236").
  def soundex(census = true)
    str = upcase.delete(SoundexCharsDel)
    return '' if str.empty?

    head = str[0, 1]
    # H and W are transparent everywhere except in first position.
    letters = head + str[1..-1].delete('HW')
    # Consonants become their digit; everything else becomes the separator '0'.
    coded = letters.tr(SoundexChars, SoundexNums).tr('^1-6', '0')
    # Collapse runs, drop the first letter's own code, then the separators.
    digits = coded.squeeze[1..-1].delete('0')
    digits = digits[0, 3] if census
    head + digits.ljust(3, '0')
  end

  # True when both strings have the same Soundex code.
  def sounds_like(other)
    soundex == other.soundex
  end
end
用法
# Encode the sample words pairwise and report whether each pair matches.
%w(Soundex Sownteks Example Ekzampul foo bar).each_slice(2) do |left, right|
  [left, right].each do |word|
    puts format('%-8s -> %s', word, word.soundex)
  end
  verdict = left.sounds_like(right) ? "sounds" : "does not sound"
  puts "'#{left}' #{verdict} like '#{right}'"
end
# Expected output:
#Soundex -> S532
#Sownteks -> S532
#'Soundex' sounds like 'Sownteks'
#Example -> E251
#Ekzampul -> E251
#'Example' sounds like 'Ekzampul'
#foo -> F000
#bar -> B600
#'foo' does not sound like 'bar'
Scala
/** Computes the 4-character Soundex code of `s`.
  *
  * The first character is kept (upper-cased) and followed by up to three
  * digits, padded with '0'. A digit is skipped when it equals the code of
  * the character directly before it (the first letter included); characters
  * without a code, H and W among them, reset that comparison.
  *
  * An empty string returns "" instead of throwing.
  */
def soundex(s:String)={
  if (s.isEmpty) ""
  else {
    val first = s.head.toUpper
    // Fold state: (digits emitted so far, code of the previous character).
    val (digits, _) = s.tail.foldLeft(("", getCode(first))) {
      case ((emitted, previous), ch) =>
        val current = getCode(ch.toUpper)
        val next = if (current.nonEmpty && current != previous) emitted + current else emitted
        (next, current)
    }
    (first.toString + digits + "0000").take(4)
  }
}
/** Returns the Soundex digit ("1" to "6") for an upper-case consonant,
  * or "" for any other character.
  */
def getCode(c:Char)={
  c match {
    case 'B' | 'F' | 'P' | 'V' => "1"
    case 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => "2"
    case 'D' | 'T' => "3"
    case 'L' => "4"
    case 'M' | 'N' => "5"
    case 'R' => "6"
    case _ => ""
  }
}
用法
/** Runs soundex over a table of names and prints, for each one, the
  * expected code, the computed code and whether they agree.
  */
def main(args: Array[String]): Unit = {
  val tests=Map(
    "Soundex" -> "S532", "Euler" -> "E460", "Gauss" -> "G200", "Hilbert" -> "H416", "Knuth" -> "K530", "Lloyd" -> "L300", "Lukasiewicz" -> "L222", "Ellery" -> "E460", "Ghosh" -> "G200", "Heilbronn" -> "H416", "Kant" -> "K530", "Ladd" -> "L300", "Lissajous" -> "L222", "Wheaton" -> "W350", "Ashcraft" -> "A226", "Burroughs" -> "B622", "Burrows" -> "B620", "O'Hara" -> "O600")
  for ((name, expected) <- tests) {
    val actual = soundex(name)
    val verdict = if (actual == expected) "OK" else "ERROR"
    printf("Name: %-20s Code: %s Found: %s - %s\n", name, expected, actual, verdict)
  }
}
Swift
在这个github仓库中, cafford编写的类是Swift语言中原始Soundex算法的实现。
//
// Soundex.swift
// speller
//
// Created by Clifford Helsel on 4/28/16.
//
// Based on standard Soundex algorithm and loosely ported from Apache Commons
// https://commons.apache.org/proper/commons-codec/apidocs/src-html/org/apache/commons/codec/language/Soundex.html
/// Standard (American) Soundex encoder for English text.
public class Soundex {
    /// Length of a complete code: one letter followed by three digits.
    private static let codeLength = 4

    /// Soundex digit for each letter A–Z; "0" marks letters that are never
    /// emitted (vowels, H, W, Y).
    private static let mapping: [Character: Character] = Dictionary(
        uniqueKeysWithValues: zip("ABCDEFGHIJKLMNOPQRSTUVWXYZ", "01230120022455012623010202")
    )

    /// Returns the Soundex digit for `c`, or "0" when `c` is not an
    /// upper-case A–Z letter (the original specification leaves that case open).
    private func mapChar(c: Character) -> Character {
        return Soundex.mapping[c] ?? "0"
    }

    /// Computes the four-character Soundex code of a string.
    ///
    /// The first character is kept (upper-cased) and followed by up to three
    /// digits, padded with "0". A digit is skipped when it equals the code of
    /// the previous significant letter, the first letter included; H and W
    /// are not significant, so equal codes on both sides of them count once.
    /// Vowels and unmapped characters separate equal codes.
    ///
    /// - Parameter of: The text to encode.
    /// - Returns: The code, e.g. "C623" for "Christopher"; an empty string
    ///   when the input is empty.
    public func soundex(of: String) -> String {
        let letters = Array(of.uppercased())
        guard let first = letters.first else {
            return ""
        }

        var out: [Character] = [first]
        // Code of the most recent letter other than H/W. It must be updated
        // for every such letter, otherwise later duplicates are compared
        // against the first letter only.
        var last = mapChar(c: first)

        for letter in letters.dropFirst() {
            if out.count == Soundex.codeLength {
                break
            }
            if letter == "H" || letter == "W" {
                continue
            }
            let mapped = mapChar(c: letter)
            if mapped != "0" && mapped != last {
                out.append(mapped)
            }
            last = mapped
        }

        while out.count < Soundex.codeLength {
            out.append("0")
        }
        return String(out)
    }
}
用法
let c = Soundex()
c.soundex(of:"Christopher") // standard Soundex gives "C623"; a result of "C631" means soundex(of:) is not tracking the previous code
VBScript
' Returns the Soundex digit ("1" to "6") for an upper-case consonant.
' For any other value the function result is left unassigned (Empty).
Function getCode(c)
    If c = "B" Or c = "F" Or c = "P" Or c = "V" Then
        getCode = "1"
    ElseIf c = "C" Or c = "G" Or c = "J" Or c = "K" Or c = "Q" Or c = "S" Or c = "X" Or c = "Z" Then
        getCode = "2"
    ElseIf c = "D" Or c = "T" Then
        getCode = "3"
    ElseIf c = "L" Then
        getCode = "4"
    ElseIf c = "M" Or c = "N" Then
        getCode = "5"
    ElseIf c = "R" Then
        getCode = "6"
    End If
End Function
' Computes the 4-character Soundex code of s.
' The first character is kept (upper-cased) and followed by up to three
' digits, padded with "0". A digit is skipped when it equals the code of
' the character directly before it, the first letter included. An empty
' string gives "0000".
Function soundex(s)
    Dim code, previous, current, i
    code = UCase(Mid(s, 1, 1))
    ' Seed with the first letter's own code so an equal second letter is skipped.
    previous = getCode(code)
    ' Stop at the last character; the previous bound of Len(s) + 1 read one
    ' position past the end of the string.
    For i = 2 To Len(s)
        current = getCode(UCase(Mid(s, i, 1)))
        If Len(current) > 0 And current <> previous Then
            code = code & current
        End If
        previous = current
    Next
    soundex = Mid(code, 1, 4)
    If Len(code) < 4 Then
        soundex = soundex & String(4 - Len(code), "0")
    End If
End Function
最后, 如果你知道Soundex算法在另一种语言中的实现(或者你对现有语言有更好的摘录), 请不要害羞, 并在评论框中与我们分享, 祝你玩得开心!
评论前必须登录!
注册