Sqlserver
 sql >> Base de données >  >> RDS >> Sqlserver

Correspondance floue à l'aide de T-SQL

J'ai trouvé que les choses que SQL Server vous donne pour faire une correspondance floue sont assez maladroites. J'ai eu vraiment de la chance avec mes propres fonctions CLR en utilisant l'algorithme de distance de Levenshtein et une certaine pondération. En utilisant cet algorithme, j'ai ensuite créé un UDF appelé GetSimilarityScore qui prend deux chaînes et renvoie un score compris entre 0,0 et 1,0. Plus la correspondance est proche de 1,0, mieux c'est. Ensuite, interrogez avec un seuil>=0,8 environ pour obtenir les correspondances les plus probables. Quelque chose comme ça :

if object_id('tempdb..#similar') is not null drop table #similar
select a.id, (
    select top 1 x.id
   from MyTable x
   where x.id <> a.id
   order by dbo.GetSimilarityScore(a.MyField, x.MyField) desc
) as MostSimilarId
into #similar
from MyTable a

select *, dbo.GetSimilarityScore(a.MyField, c.MyField)
from MyTable a
join #similar b on a.id = b.id
join MyTable c on b.MostSimilarId = c.id

Ne le faites pas avec de très grandes tables. C'est un processus lent.

Voici les UDF CLR :

''' <summary>
''' Compute the distance between two strings.
''' </summary>
''' <param name="s1">The first of the two strings.</param>
''' <param name="s2">The second of the two strings.</param>
''' <returns>The Levenshtein cost.</returns>
<Microsoft.SqlServer.Server.SqlFunction()> _
Public Shared Function ComputeLevenstheinDistance(ByVal string1 As SqlString, ByVal string2 As SqlString) As SqlInt32
    If string1.IsNull OrElse string2.IsNull Then Return SqlInt32.Null
    Dim s1 As String = string1.Value
    Dim s2 As String = string2.Value

    Dim n As Integer = s1.Length
    Dim m As Integer = s2.Length
    Dim d As Integer(,) = New Integer(n, m) {}

    ' Step 1
    If n = 0 Then Return m
    If m = 0 Then Return n

    ' Step 2
    For i As Integer = 0 To n
        d(i, 0) = i
    Next

    For j As Integer = 0 To m
        d(0, j) = j
    Next

    ' Step 3
    For i As Integer = 1 To n
        'Step 4
        For j As Integer = 1 To m
            ' Step 5
            Dim cost As Integer = If((s2(j - 1) = s1(i - 1)), 0, 1)

            ' Step 6
            d(i, j) = Math.Min(Math.Min(d(i - 1, j) + 1, d(i, j - 1) + 1), d(i - 1, j - 1) + cost)
        Next
    Next
    ' Step 7
    Return d(n, m)
End Function

''' <summary>
''' Returns a score between 0.0-1.0 indicating how closely two strings match.  1.0 is a 100%
''' T-SQL equality match, and the score goes down from there towards 0.0 for less similar strings.
''' </summary>
<Microsoft.SqlServer.Server.SqlFunction()> _
Public Shared Function GetSimilarityScore(string1 As SqlString, string2 As SqlString) As SqlDouble
    If string1.IsNull OrElse string2.IsNull Then Return SqlInt32.Null

    Dim s1 As String = string1.Value.ToUpper().TrimEnd(" "c)
    Dim s2 As String = string2.Value.ToUpper().TrimEnd(" "c)
    If s1 = s2 Then Return 1.0F ' At this point, T-SQL would consider them the same, so I will too

    Dim flatLevScore As Double = InternalGetSimilarityScore(s1, s2)

    Dim letterS1 As String = GetLetterSimilarityString(s1)
    Dim letterS2 As String = GetLetterSimilarityString(s2)
    Dim letterScore As Double = InternalGetSimilarityScore(letterS1, letterS2)

    'Dim wordS1 As String = GetWordSimilarityString(s1)
    'Dim wordS2 As String = GetWordSimilarityString(s2)
    'Dim wordScore As Double = InternalGetSimilarityScore(wordS1, wordS2)

    If flatLevScore = 1.0F AndAlso letterScore = 1.0F Then Return 1.0F
    If flatLevScore = 0.0F AndAlso letterScore = 0.0F Then Return 0.0F

    ' Return weighted result
    Return (flatLevScore * 0.2F) + (letterScore * 0.8F)
End Function

Private Shared Function InternalGetSimilarityScore(s1 As String, s2 As String) As Double
    Dim dist As SqlInt32 = ComputeLevenstheinDistance(s1, s2)
    Dim maxLen As Integer = If(s1.Length > s2.Length, s1.Length, s2.Length)
    If maxLen = 0 Then Return 1.0F
    Return 1.0F - Convert.ToDouble(dist.Value) / Convert.ToDouble(maxLen)
End Function

''' <summary>
''' Sorts all the alpha numeric characters in the string in alphabetical order
''' and removes everything else.
''' </summary>
Private Shared Function GetLetterSimilarityString(s1 As String) As String
    Dim allChars = If(s1, "").ToUpper().ToCharArray()
    Array.Sort(allChars)
    Dim result As New StringBuilder()
    For Each ch As Char In allChars
        If Char.IsLetterOrDigit(ch) Then
            result.Append(ch)
        End If
    Next
    Return result.ToString()
End Function

''' <summary>
''' Removes all non-alpha numeric characters and then sorts
''' the words in alphabetical order.
''' </summary>
Private Shared Function GetWordSimilarityString(s1 As String) As String
    Dim words As New List(Of String)()
    Dim curWord As StringBuilder = Nothing
    For Each ch As Char In If(s1, "").ToUpper()
        If Char.IsLetterOrDigit(ch) Then
            If curWord Is Nothing Then
                curWord = New StringBuilder()
            End If
            curWord.Append(ch)
        Else
            If curWord IsNot Nothing Then
                words.Add(curWord.ToString())
                curWord = Nothing
            End If
        End If
    Next
    If curWord IsNot Nothing Then
        words.Add(curWord.ToString())
    End If

    words.Sort(StringComparer.OrdinalIgnoreCase)
    Return String.Join(" ", words.ToArray())
End Function