2015-04-02 80 views
0

我在SQL Server中有1000個表,每個表都是從CSV文件創建的。每張表中的數據都是相似的,每張表代表不同的一天。在數據庫中查找被複制/重複結構的表

我遇到的問題是表格的結構以及列的名稱有很多變化。

但是有些表格確實有匹配的結構,我認爲合併數據的一個好的起點是將這些數據合併在一起。

我一直在尋找一種查詢數據庫的方法,以便找出這些結構相同的表,但到目前爲止尚未成功。

任何幫助將不勝感激。

+0

通常你會使用一個臨時表導入數據,然後複製/移動到你的數據庫的真實的表。 – 2015-04-02 21:15:08

+0

我最初打算做類似的事情。當我發現每個文件中有超過400列時,我改變了主意,並開始讓他們進入數據庫,同時我試圖找出該做什麼。 – 2015-04-02 21:18:28

回答

1

如果這些表的結構確實完全相同,可以試試下面的查詢。實際上,我用它來生成INSERT語句,必要時它還可以生成刪除舊表的語句。

-- Demo fixture: five throw-away tables used to exercise the structure-matching query.
-- table1, table2 and table5 share one layout; table3 and table4 share another.

-- Drop leftovers from a previous run so the script can be re-executed.
IF OBJECT_ID('dbo.table1') IS NOT NULL DROP TABLE dbo.table1;
IF OBJECT_ID('dbo.table2') IS NOT NULL DROP TABLE dbo.table2;
IF OBJECT_ID('dbo.table3') IS NOT NULL DROP TABLE dbo.table3;
IF OBJECT_ID('dbo.table4') IS NOT NULL DROP TABLE dbo.table4;
IF OBJECT_ID('dbo.table5') IS NOT NULL DROP TABLE dbo.table5;

-- Create the tables explicitly in dbo so they land in the same schema the DROP
-- guards above check, whatever the caller's default schema is.
CREATE TABLE dbo.table1 (ID INT,FirstName VARCHAR(25),LastName NVARCHAR(25),EntryDate DATETIME,AvgScore NUMERIC(18,6)); --table1
CREATE TABLE dbo.table2 (ID INT,FirstName VARCHAR(25),LastName NVARCHAR(25),EntryDate DATETIME,AvgScore NUMERIC(18,6)); --matches table1
CREATE TABLE dbo.table3 (ID INT,FirstName VARCHAR(25),LastName NVARCHAR(25),EntryDate DATETIME); --table3
CREATE TABLE dbo.table4 (ID INT,FirstName VARCHAR(25),LastName NVARCHAR(25),EntryDate DATETIME); --matches table3
CREATE TABLE dbo.table5 (ID INT,FirstName VARCHAR(25),LastName NVARCHAR(25),EntryDate DATETIME,AvgScore NUMERIC(18,6)); --matches table1




-- Finds pairs of tables whose column definitions are identical position by
-- position, then generates one INSERT ... SELECT / DROP TABLE script line per
-- duplicate table.
--
-- NOTE: tables are identified by TABLE_NAME only; TABLE_SCHEMA is not compared,
-- so same-named tables in different schemas are not distinguished.
WITH CTE_column_catalog
AS
(
    -- Every catalog column, tagged with the highest ordinal position of its
    -- table, which serves as that table's column count.
    SELECT
        TABLE_NAME,
        ORDINAL_POSITION,
        COLUMN_NAME,
        DATA_TYPE,
        IS_NULLABLE,
        CHARACTER_MAXIMUM_LENGTH,
        NUMERIC_PRECISION,
        NUMERIC_SCALE,
        DATETIME_PRECISION,
        MAX(ORDINAL_POSITION) OVER (PARTITION BY TABLE_NAME) AS total_columns
    FROM INFORMATION_SCHEMA.COLUMNS
),
CTE_matching_Tables
AS
(
    SELECT
        A.TABLE_NAME AS primaryTable,
        A.total_columns,
        COUNT(*) AS matching_columns,
        B.TABLE_NAME AS matchedTable
    FROM CTE_column_catalog AS A
    INNER JOIN CTE_column_catalog AS B
        -- "<" reports each unordered pair once, with the lower name as primary.
        ON  A.TABLE_NAME < B.TABLE_NAME
        AND A.ORDINAL_POSITION = B.ORDINAL_POSITION
        AND A.total_columns = B.total_columns
        AND A.COLUMN_NAME = B.COLUMN_NAME
        AND A.DATA_TYPE = B.DATA_TYPE
        AND A.IS_NULLABLE = B.IS_NULLABLE
        -- The size/precision attributes are NULL for types they do not apply
        -- to, so two NULLs must count as equal.
        AND (   A.CHARACTER_MAXIMUM_LENGTH = B.CHARACTER_MAXIMUM_LENGTH
             OR (A.CHARACTER_MAXIMUM_LENGTH IS NULL AND B.CHARACTER_MAXIMUM_LENGTH IS NULL))
        AND (   A.NUMERIC_PRECISION = B.NUMERIC_PRECISION
             OR (A.NUMERIC_PRECISION IS NULL AND B.NUMERIC_PRECISION IS NULL))
        AND (   A.NUMERIC_SCALE = B.NUMERIC_SCALE
             OR (A.NUMERIC_SCALE IS NULL AND B.NUMERIC_SCALE IS NULL))
        AND (   A.DATETIME_PRECISION = B.DATETIME_PRECISION
             OR (A.DATETIME_PRECISION IS NULL AND B.DATETIME_PRECISION IS NULL))
    GROUP BY A.TABLE_NAME, A.total_columns, B.TABLE_NAME
    -- Keep a pair only when every column matched.
    HAVING A.total_columns = COUNT(*)
)

-- The CTE holds every matching pair. For each matched table, pick the lowest
-- primary table name so that all duplicates are folded into a single target
-- (e.g. table2 and table5 both insert into table1, even though table2 and
-- table5 also match each other).
SELECT 'INSERT INTO ' + MIN(primaryTable) + ' SELECT * FROM ' + matchedTable + '; DROP TABLE ' + matchedTable + ';'
FROM CTE_matching_Tables
GROUP BY matchedTable;

結果:

INSERT INTO table1 SELECT * FROM table2; DROP TABLE table2; 
INSERT INTO table3 SELECT * FROM table4; DROP TABLE table4; 
INSERT INTO table1 SELECT * FROM table5; DROP TABLE table5; 
+0

這很好,謝謝。然而,我必須修改它以包含一個檢查,即按名稱匹配的列數等於列總數。 – 2015-04-04 20:51:00

+0

哦,很好。我刪除了該行,並意外忘記重新添加它。 – Stephan 2015-04-04 21:01:24

1

以下代碼查找列數完全相同且列類型也匹配的表。請注意,列的順序並不重要。例如,如果您有兩個這樣的表:

Table01 
Column01 INT 
Column02 BIT 

Table02 
Column01 BIT 
Column02 INT 

由於具有相同的結構,因此將會進行匹配。


下面的代碼很簡單 - 我們爲每個表創建一個由其列類型組成的CSV列表。

-- Scratch table: one row per user table, holding the table name and a
-- comma-separated list of the system_type_id of each of its columns.
DECLARE @DataSource TABLE
(
    [name] SYSNAME,
    [value] VARCHAR(MAX)
);

-- Build each table's type list with FOR XML PATH string aggregation.
-- The ids are sorted, so column order does not affect the resulting list.
INSERT INTO @DataSource ([name], [value])
SELECT
    tbl.[name],
    STUFF
    (
        (
            SELECT ',' + CAST(col.[system_type_id] AS VARCHAR(12))
            FROM [sys].[columns] AS col
            WHERE col.[object_id] = tbl.[object_id]
            ORDER BY col.[system_type_id]
            FOR XML PATH(''), TYPE
        ).value('.', 'VARCHAR(MAX)'),
        1,
        1,
        ''      -- strip the leading comma
    )
FROM [sys].[tables] AS tbl;

從表中選擇看起來是這樣的:

enter image description here

現在,我們再做一次同樣的事情,但這次按列類型CSV列表進行分組,把具有相同列表的表名拼接在一起:

-- For each distinct column-type list in @DataSource, return the list together
-- with a comma-separated, alphabetically sorted list of the tables that have it.
-- Iterating over DISTINCT lists (rather than over every table row) yields one
-- row per group instead of one identical row per member table.
SELECT Signatures.[value]
     ,NamesCSV.[value]
FROM
(
    SELECT DISTINCT [value]
    FROM @DataSource
) Signatures
CROSS APPLY
(
    SELECT STUFF
    (
     (
      SELECT ',' + D.[name]
      FROM @DataSource D
      WHERE Signatures.[value] = D.[value]
      ORDER BY D.[name]
      FOR XML PATH(''), TYPE
     ).value('.', 'VARCHAR(MAX)')
     ,1
     ,1
     ,''    -- strip the leading comma
    )
) NamesCSV ([value]);

我在AdventureWorks2012數據庫中測試了這段代碼,它確實找到了一些結構匹配的表:

enter image description here

當然,這只是一個起點。你也可以檢查其他的東西。例如,對於每一個列類型ID,您可以附加該列是NULL還是NOT NULL,像這樣:

TYPEID|NOTNULL,TYPEID|NULL... 
1

你會在信息視圖INFORMATION_SCHEMA.COLUMNS找到大量的數據。

這會給你(除其他外)表名,列順序,列名和列定義。

因此,舉例來說,你可以做這樣的事情:

;
-- Lists every pair of tables whose columns match one-to-one by ordinal
-- position, data type and character length.
--
-- Catalog identifiers are spelled exactly as the catalog defines them
-- (INFORMATION_SCHEMA, TABLES, COLUMNS, TABLE_NAME, ...) and aliases keep a
-- single spelling, so the query also runs on databases that use a
-- case-sensitive collation.
--
-- NOTE: tables are matched by TABLE_NAME only; TABLE_SCHEMA is not compared.

-- Create a list of table pairs. If you have reason to believe that
-- some tables are more likely to be similar than others, you can
-- modify this CTE as you need to.
WITH A AS (
    SELECT t1.TABLE_NAME AS table_name
         , t2.TABLE_NAME AS other_table_Name
    FROM INFORMATION_SCHEMA.TABLES t1
        JOIN INFORMATION_SCHEMA.TABLES t2
            -- "<" lists each unordered pair once and never pairs a table with itself.
            ON t1.TABLE_NAME < t2.TABLE_NAME
)
-- Pick all the pairs of table names ...
SELECT *
FROM A
WHERE NOT EXISTS (
        -- where the first table does NOT have any columns ...
        SELECT 1
        FROM INFORMATION_SCHEMA.COLUMNS c1
        WHERE A.table_name = c1.TABLE_NAME
          AND NOT EXISTS (
            -- ... that are NOT found in the second table ...
            SELECT 1
            FROM INFORMATION_SCHEMA.COLUMNS c2
            WHERE c2.TABLE_NAME = A.other_table_Name
              AND c1.ORDINAL_POSITION = c2.ORDINAL_POSITION
              AND c1.DATA_TYPE = c2.DATA_TYPE
              -- Length is NULL for non-character types; two NULLs count as equal.
              AND ((c1.CHARACTER_MAXIMUM_LENGTH IS NULL AND
                    c2.CHARACTER_MAXIMUM_LENGTH IS NULL) OR
                   c1.CHARACTER_MAXIMUM_LENGTH = c2.CHARACTER_MAXIMUM_LENGTH)
          )
      )
  AND NOT EXISTS (
        -- ... and the second table doesn't have any columns ...
        SELECT 1
        FROM INFORMATION_SCHEMA.COLUMNS c1
        WHERE A.other_table_Name = c1.TABLE_NAME
          AND NOT EXISTS (
            -- that are not also found in the first table!
            SELECT 1
            FROM INFORMATION_SCHEMA.COLUMNS c2
            WHERE c2.TABLE_NAME = A.table_name
              AND c1.ORDINAL_POSITION = c2.ORDINAL_POSITION
              AND c1.DATA_TYPE = c2.DATA_TYPE
              AND ((c1.CHARACTER_MAXIMUM_LENGTH IS NULL AND
                    c2.CHARACTER_MAXIMUM_LENGTH IS NULL) OR
                   c1.CHARACTER_MAXIMUM_LENGTH = c2.CHARACTER_MAXIMUM_LENGTH)
          )
      );
1

我對INFORMATION_SCHEMA.COLUMNS中的一組列計算校驗和。這樣就能找出每個表以及與它具有相同幻數(各列校驗和的總和)的所有表。

-- Finds tables whose column definitions produce the same "magic number":
-- the sum, over all columns of a table, of CHECKSUM() of the column's
-- name, default, nullability, type and size/precision attributes.
--
-- The sum does not include ORDINAL_POSITION, so column order does not affect
-- it. NOTE(review): equal magic numbers are candidates rather than proof of
-- identical structure, since different inputs can produce the same sum.

if OBJECT_ID('tempdb..#alltables','U') is not null
    drop table #alltables;

create table #alltables (schema_name sysname,
        table_name sysname,
        magicnum decimal(18,0));

-- One set-based pass over the column catalog: one row per (schema, table).
-- Each checksum is cast to decimal before summing so the total cannot
-- overflow the int that CHECKSUM returns.
insert into #alltables (schema_name, table_name, magicnum)
select TABLE_SCHEMA,
       TABLE_NAME,
       SUM(CAST(
        CHECKSUM (COLUMN_NAME, COLUMN_DEFAULT, IS_NULLABLE,
        DATA_TYPE, CHARACTER_MAXIMUM_LENGTH,
        NUMERIC_PRECISION, NUMERIC_SCALE, DATETIME_PRECISION)
       as decimal(18,0)))
from INFORMATION_SCHEMA.COLUMNS
group by TABLE_SCHEMA, TABLE_NAME;

-- Report only the tables whose magic number is shared with at least one other table.
with counted as (
    select schema_name,
           table_name,
           magicnum,
           COUNT(*) over (PARTITION by magicnum) as tables_sharing
    from #alltables
)
select schema_name, table_name, magicnum
from counted
where tables_sharing > 1
order by magicnum, table_name, schema_name;