Sqlserver
sql >> Base de données >> RDS >> Sqlserver

Un problème de requête SQL très compliqué

J'ai légèrement modifié votre modèle de données pour rendre plus évident ce qui se passe :

-- Customers to be linked together. [CustomerLink] is nullable; the only
-- statement in this script that writes it is the UPDATE further down.
CREATE TABLE [dbo].[Customer] (
    [CustomerName] VARCHAR(20)   NOT NULL,
    [CustomerLink] VARBINARY(20) NULL
);

-- Identification documents held by each customer: one row per
-- (customer, document type), carrying the document's ID value.
CREATE TABLE [dbo].[CustomerIdentification] (
    [CustomerName] VARCHAR(20) NOT NULL,
    [ID]           VARCHAR(50) NOT NULL,
    [IDType]       VARCHAR(16) NOT NULL
);

J'ai également ajouté quelques données de test supplémentaires :

-- Seed the test customers. [CustomerLink] is not listed, so it is left NULL.
INSERT INTO [dbo].[Customer] ([CustomerName])
VALUES
    ('Fred'),
    ('Bob'),
    ('Vince'),
    ('Tom'),
    ('Alice'),
    ('Matt'),
    ('Dan');

-- Seed the identification documents for the test customers.
-- The column list is spelled out so the statement does not depend on the
-- physical column order of the table.
INSERT INTO [dbo].[CustomerIdentification]
        ([CustomerName], [ID], [IDType])
VALUES
        ('Fred',    'A',    'Passport'),
        ('Fred',    'A',    'SIN'),
        ('Fred',    'A',    'Drivers Licence'),
        ('Bob',     'A',    'Passport'),
        ('Bob',     'B',    'Drivers Licence'),
        ('Bob',     'C',    'Credit Card'),
        ('Vince',   'A',    'Passport'),
        ('Vince',   'B',    'SIN'),
        ('Vince',   'C',    'Credit Card'),
        ('Tom',     'A',    'Passport'),
        ('Tom',     'B',    'SIN'),
        ('Tom',     'B',    'Drivers Licence'),
        ('Alice',   'B',    'Drivers Licence'),
        ('Matt',    'X',    'Drivers Licence'),
        ('Dan',     'X',    'Drivers Licence');

Est-ce ce que vous cherchez ?

-- Computes a [CustomerLink] hash for every customer that can be chained to at
-- least one other customer through shared identification, and stores it on
-- [dbo].[Customer]. Customers that appear in no matched pair are not updated.
-- Known limitation: names are concatenated with '$' as the separator, so a
-- [CustomerName] that itself contains '$' can still confuse the list matching.
;WITH [cteNonMatchingIDs] AS (
    -- Ordered pairs of different customers that hold the same IDType with
    -- different IDs, i.e. pairs that conflict on at least one document type.
    SELECT  ci3.[CustomerName] AS [CustomerName1],
            ci4.[CustomerName] AS [CustomerName2]
    FROM [dbo].[CustomerIdentification] ci3
    INNER JOIN [dbo].[CustomerIdentification] ci4
        ON ci3.[IDType] = ci4.[IDType]
    WHERE ci3.[CustomerName] <> ci4.[CustomerName]
    AND ci3.[ID] <> ci4.[ID]
),
[cteMatchedPairs] AS (
    -- Pairs (CustomerName1 < CustomerName2) that share at least one
    -- (IDType, ID) and have no conflicting document type at all.
    -- This is written as an INNER JOIN: the name-ordering predicate already
    -- rejected every unmatched row, so an outer join with ISNULL added nothing.
    SELECT DISTINCT
            ci1.[CustomerName] AS [CustomerName1],
            ci2.[CustomerName] AS [CustomerName2]
    FROM [dbo].[CustomerIdentification] ci1
    INNER JOIN [dbo].[CustomerIdentification] ci2
        ON ci1.[IDType] = ci2.[IDType]
        AND ci1.[ID] = ci2.[ID]
        -- strict ordering keeps one row per unordered pair and implies <>
        AND ci1.[CustomerName] < ci2.[CustomerName]
    WHERE NOT EXISTS (
        SELECT 1
        FROM [cteNonMatchingIDs] nm
        WHERE nm.[CustomerName1] = ci1.[CustomerName]
        AND nm.[CustomerName2] = ci2.[CustomerName]
    )
),
[cteMatchedList] ([CustomerName], [CustomerNameList]) AS (
    -- Anchor 1: every pair starts a list '$Name1$Name2' owned by Name1.
    SELECT  [CustomerName1],
            CONVERT(VARCHAR(1000), '$' + [CustomerName1] + '$' + [CustomerName2])
    FROM [cteMatchedPairs]
    UNION ALL
    -- Anchor 2: the second name of every pair gets a one-element list, so it
    -- can later be located inside the longer lists.
    SELECT  [CustomerName2],
            CONVERT(VARCHAR(1000), '$' + [CustomerName2])
    FROM [cteMatchedPairs]
    UNION ALL
    -- Recursive step: extend a list whose last element is the first name of
    -- another pair. The comparison includes the '$' separator so that a name
    -- which is merely a suffix of the last element (e.g. 'Ann' vs 'JoAnn')
    -- does not extend the wrong chain. Names strictly increase along a chain,
    -- which guarantees termination.
    SELECT  ml.[CustomerName],
            CONVERT(VARCHAR(1000), ml.[CustomerNameList] + '$' + mp.[CustomerName2])
    FROM [cteMatchedList] ml
    INNER JOIN [cteMatchedPairs] mp
        ON RIGHT(ml.[CustomerNameList], LEN(mp.[CustomerName1]) + 1)
           = '$' + mp.[CustomerName1]
),
[cteSubstringLists] AS (
    -- For each list owned by a customer, every list that contains it as a run
    -- of whole names. A trailing '$' is appended to both sides so '$Ann' is
    -- not found inside '$Anna'; CHARINDEX is used instead of LIKE so that
    -- '%', '_' and '[' in names are treated literally.
    SELECT  r1.[CustomerName],
            r2.[CustomerNameList]
    FROM [cteMatchedList] r1
    INNER JOIN [cteMatchedList] r2
        ON CHARINDEX(r1.[CustomerNameList] + '$', r2.[CustomerNameList] + '$') > 0
),
[cteCustomerLink] AS (
    -- Pick, per customer, the longest containing list and hash it.
    -- NOTE(review): if two different lists tie for the longest length, the
    -- customer yields more than one row here and the UPDATE below applies one
    -- of them arbitrarily.
    SELECT DISTINCT
            longest.[CustomerName],
            HASHBYTES('SHA1', sl.[CustomerNameList]) AS [CustomerLink]
    FROM (
        SELECT  [CustomerName],
                MAX(LEN([CustomerNameList])) AS [MaxListLen]
        FROM [cteSubstringLists]
        GROUP BY [CustomerName]
    ) longest
    INNER JOIN [cteSubstringLists] sl
        ON sl.[CustomerName] = longest.[CustomerName]
        AND LEN(sl.[CustomerNameList]) = longest.[MaxListLen]
)
-- Only customers present in [cteCustomerLink] are touched; the INNER JOIN is
-- the row filter for this UPDATE.
UPDATE  c
SET     [CustomerLink] = cl.[CustomerLink]
FROM [dbo].[Customer] c
INNER JOIN [cteCustomerLink] cl
    ON cl.[CustomerName] = c.[CustomerName];


-- Show every customer with its link value. Columns are listed explicitly and
-- the rows are ordered so the output is stable from run to run.
SELECT  [CustomerName],
        [CustomerLink]
FROM [dbo].[Customer]
ORDER BY [CustomerName];