2011-04-10 41 views
2

我有一個遺留數據庫導入sql server 2008 r2,其中包含數百個表(一些有數百個列)沒有索引和主/外鍵。我手動識別的主鍵(有些是複合的)通常按列順序排列,但如果我可以幫忙的話,我不會花費數週的時間來處理它們。從表數據解釋主鍵

是否有任何工具或腳本可用於分析數據的基數,以建議或編寫可能的主鍵?

到目前爲止,我找到了following script,但它引發了一些錯誤。我會調試它,看看我能不能找出問題出在哪裏,但是想知道是否有人遇到過類似的問題並設法解決問題。從鏈接

腳本文本低於

--Natural Key Finder 
--blindman, 6/2005 
--This script searches up to 63 columns of any table for permutations of fields that represent unique keys within the dataset. 
--The search can be limited by the maximum columns per key, and by the maximum minutes for processing. 

declare @TableName varchar(50) 
declare @MaxColumns int 
declare @MaxMinutes int 

--Get a suitable test table from the current database 
/* 
set @TableName = 
     (select top 1 sysobjects.name 
     from sysobjects 
      inner join sysindexes 
       on sysobjects.id = sysindexes.id 
       and indid in (0, 1) 
      inner join syscolumns on sysobjects.id = syscolumns.id 
     where sysobjects.type = 'U' 
     group by sysobjects.name, 
      sysindexes.rowcnt 
     having count(*) between 5 and 15 
     order by rowcnt * count(*) desc) 
*/ 

set @TableName = 'calendarEvents' --Enter the name of the table to analyze. Do NOT enclose it in brackets: []! 
set @MaxColumns = 63 --Set the maximum number of columns per key you want to search for. 
set @MaxMinutes = 5 --Set a maximum time limit on the search process. 

set nocount on 

--Procedure variables 
declare @RecordCount bigint --The number of records in the table, and the maximum possible cardinality. 
declare @ColumnMask bigint --A bitmask representing the presence or absence of columns in a column set. 
declare @ColumnString varchar(8000) --A comma-delimited string including all the column names in a column set. 
declare @KeyLength int --The permutation size in columns currently being searched. 
declare @StartTime datetime --Used to track execution time of the script. 
declare @sqlstring varchar(8000) --Dynamically created sql statement. 
declare @ProcessGUID char(32) --This random value will be used to name dynamically created database objects. 
declare @SampleTableName varchar(50) --This value will hold the name of a dynamically created subset of the table data. 
declare @SubMask1 bigint --Bitmask for storing temporary column mask. 
declare @SubMask2 bigint --Bitmask for storing temporary column mask. 
declare @SubMask3 bigint --Bitmask for storing temporary column mask. 
declare @BitMask bigint --Temporary bitmask for stepping through a column mask to find active columns. 
declare @Counter int --counts the number permutations analyzed 

set @StartTime = getdate() 
set @ProcessGUID = replace(newid(), '-', '') --Generate a random string. 
set @SampleTableName = 'tbl' + @ProcessGUID --Generate a random table name. 
set @Counter = 0 
if @MaxColumns > 63 set @MaxColumns = 63 

--Create a temp table to hold the record count processed through dynamic SQL 
set @RecordCount = 
    (select rowcnt 
    from sysobjects 
     inner join sysindexes 
      on sysobjects.id = sysindexes.id 
    where sysobjects.name = @TableName 
     and sysindexes.indid in (1, 0)) 

--Create a table for listing and tracking permutations of columns. 
create table #ColumnSets 
    (ColumnSetID int identity, 
    ColumnMask bigint, --a bitmask value reprenting the presence or absence of columns in a column set. 
    ColumnCount int, --the number of columns in the set. 
    Cardinality bigint, --the number of unique values in the column set. 
    IsUnique int, -- 0: not a naturalkey. 1: natural key. 
    ColumnString varchar(4000)) --comma-delimited string listing all the columns in the column set. 

exec ('CREATE CLUSTERED INDEX IX_' + @ProcessGUID + '_1 ON #ColumnSets (ColumnMask)') 

--Create the parent records in the ColumnSets table 
insert into #ColumnSets 
    (ColumnMask, 
    ColumnCount, 
    Cardinality, 
    ColumnString) 
select case when colid < 64 then power(cast(2 as bigint), colid-1) else 0 end, --Converts the colid value to a bitmap value. 
    1, 
    @RecordCount + 1, --Cardinality is unknown, so estimate 1 higher than the maximum possible cardinality. 
    '[' + syscolumns.name + ']' 
from syscolumns 
    inner join sysobjects on syscolumns.id = sysobjects.id 
where sysobjects.name = @TableName 
    and syscolumns.prec is not null 
    and colid < 64 

--Informational message 
select 'Analyzing table "' + @TableName + '" containing ' + cast(@RecordCount as varchar(20)) + ' records in ' + cast(count(*) as varchar(3)) + ' columns.' 
from #ColumnSets 
where ColumnCount = 1 

--First we will search for large non-primary keys in a sample set of the records. If we find any large permutations that 
--are known not to be unique, then we can eliminate any subsets of these permutations from processing later on. 
select cast(power(@MaxMinutes, 0.5) -1 as varchar(4)) + ' minutes will be alloted for pre-searching.' 

declare @SampleSize int 
set @SampleSize = power(@RecordCount, (0.5)) --Sample sqrt(RecordCount) 
set @SQLString = 'select top ' + cast(@SampleSize as varchar(10)) + ' * into ' + @SampleTableName + ' from ' + @TableName 
exec (@SQLString) 

--Start with a columnset including all columns 
insert into #ColumnSets 
    (ColumnMask, 
    ColumnCount) 
select sum(ColumnMask), 
    count(*) 
from #ColumnSets 
where ColumnCount = 1 

set @KeyLength = (select count(*) from #ColumnSets where ColumnCount = 1) 

while exists (select * from #ColumnSets where IsUnique is null and ColumnCount > 1)--isnull((Select max(ColumnCount) from #ColumnSets where IsUnique = 0), 1)) 
    begin 
     --show status 
     select 'Largest non-unique permutation found: ' + cast(isnull(max(ColumnCount), 0) as varchar(3)) + ' columns.' 
     from #ColumnSets 
     where IsUnique = 0 

     --If the estimated cardinality is less than the record count, we know the column set cannot 
     --possibly be unique, so mark it as non-unique. 
     update #ColumnSets 
     set IsUnique = 0 
     where Cardinality < @RecordCount 
      and IsUnique is null 

     while exists (select * from #ColumnSets where IsUnique is null and ColumnCount > 1)--isnull((Select max(ColumnCount) from #ColumnSets where IsUnique = 0), 1)) 
      begin 

       --set @KeyLength = (select max(ColumnCount) from #ColumnSets where IsUnique is null and ColumnCount > 1) 
       --Get the cardinality of all untested column sets 
       set @Counter = @Counter + 
        (Select Count(*) 
        from #ColumnSets 
        where IsUnique is null 
         and ColumnCount = (select max(ColumnCount) from #ColumnSets where IsUnique is null and ColumnCount > 1)) 

       declare CSCursor cursor for 
        Select ColumnMask, 
         ColumnString 
        from #ColumnSets 
        where IsUnique is null 
         and ColumnCount = (select max(ColumnCount) from #ColumnSets where IsUnique is null and ColumnCount > 1) 

       Open CSCursor 
       Fetch next from CSCursor into @ColumnMask, @ColumnString 
       while @@FETCH_STATUS = 0 
        begin 
         --Create the column string for the bitmask 
         select @ColumnString = isnull(@ColumnString + ', ', '') + ColumnString 
         from #ColumnSets 
         where ColumnCount = 1 
          and ColumnMask & @ColumnMask = ColumnMask 
         order by ColumnString 

         set @SQLString = 'Update #ColumnSets set ColumnString = ''' + @ColumnString + ''', Cardinality = (select count(*) from (select distinct ' + @ColumnString + ' from ' + @SampleTableName + ') DistinctValues), IsUnique = 0 where IsUnique is null and ColumnMask = ' + cast(@ColumnMask as varchar(20)) 
         exec (@SQLString) 
         fetch next from CSCursor into @ColumnMask, @ColumnString 

         --Break out of this loop if we have used more than half of the alloted processing time. 
         if dateadd(mi, power(@MaxMinutes, 0.5), @StartTime) < getdate() break 
        end 
       Close CSCursor 
       Deallocate CSCursor 

       --Any columnsets with a cardinality equal to the rowcount represent natural keys 
       Update #ColumnSets set IsUnique = 1 where Cardinality = @SampleSize and ColumnCount > 1 
       if dateadd(mi, power(@MaxMinutes, 0.5), @StartTime) < getdate() break 

       --Delete subsets of combinations known not to be unique. 
       delete CandidateSets 
       from #ColumnSets CandidateSets 
        inner join #ColumnSets Supersets on CandidateSets.ColumnMask & Supersets.ColumnMask = CandidateSets.ColumnMask 
       where SuperSets.IsUnique = 0 
        and CandidateSets.ColumnCount > 1 
        and CandidateSets.ColumnCount < SuperSets.ColumnCount 
      end 

     if dateadd(mi, power(@MaxMinutes, 0.5) -1, @StartTime) < getdate() break 

     --Split non-minimal unique sets into three subsets. We will shuffle these subset together 
     --to creat new permutations for searching. 
     declare CSCursor cursor for 
      Select ColumnMask 
      from #ColumnSets 
      where IsUnique = 1 
       and ColumnCount >= 6 

     Open CSCursor 
     Fetch next from CSCursor into @ColumnMask 
     while @@FETCH_STATUS = 0 
      begin 
       set @BitMask = 1 
       set @SubMask1 = 0 
       set @SubMask2 = 0 
       set @SubMask3 = 0 

       --Note that this permutation has already been split 
       update #ColumnSets set IsUnique = 2 where ColumnMask = @ColumnMask 

       --split the TargetMask into three distinct masks 
       while @BitMask < @ColumnMask 
        begin 
        while @BitMask < @ColumnMask and @BitMask & @ColumnMask <> @BitMask set @BitMask = @BitMask * 2 
        set @SubMask1 = @SubMask1 | @BitMask 
        if @BitMask > @ColumnMask/2 break 
        set @BitMask = @BitMask * 2 
        while @BitMask < @ColumnMask and @BitMask & @ColumnMask <> @BitMask set @BitMask = @BitMask * 2 
        set @SubMask2 = @SubMask2 | @BitMask 
        if @BitMask > @ColumnMask/2 break 
        set @BitMask = @BitMask * 2 
        while @BitMask < @ColumnMask and @BitMask & @ColumnMask <> @BitMask set @BitMask = @BitMask * 2 
        set @SubMask3 = @SubMask3 | @BitMask 
        if @BitMask > @ColumnMask/2 break 
        set @BitMask = @BitMask * 2 
        end  

       insert into #ColumnSets 
        (ColumnMask, 
        ColumnCount, 
        Cardinality) 
       select @SubMask1, 
        Count(*), 
        @RecordCount + 1 
       from #ColumnSets 
       where ColumnCount = 1 
        and ColumnMask & @SubMask1 = ColumnMask 

       insert into #ColumnSets 
        (ColumnMask, 
        ColumnCount, 
        Cardinality) 
       select @SubMask2, 
        Count(*), 
        @RecordCount + 1 
       from #ColumnSets 
       where ColumnCount = 1 
        and ColumnMask & @SubMask2 = ColumnMask 

       insert into #ColumnSets 
        (ColumnMask, 
        ColumnCount, 
        Cardinality) 
       select @SubMask3, 
        Count(*), 
        @RecordCount + 1 
       from #ColumnSets 
       where ColumnCount = 1 
        and ColumnMask & @SubMask3 = ColumnMask 

       fetch next from CSCursor into @ColumnMask 
      end 
     Close CSCursor 
     Deallocate CSCursor 

     --Create larger combinations of non-unique permutations for testing 
     insert into #ColumnSets 
      (ColumnMask, 
      ColumnCount, 
      Cardinality) 
     select distinct 
      ColumnSetsA.ColumnMask | ColumnSetsB.ColumnMask, 
      ColumnSetsA.ColumnCount + ColumnSetsB.ColumnCount, 
      @RecordCount + 1 
     from #ColumnSets ColumnSetsA, 
      #ColumnSets ColumnSetsB 
     where ColumnSetsA.ColumnCount + ColumnSetsB.ColumnCount > isnull((Select max(ColumnCount) from #ColumnSets where IsUnique = 0), 0) 
      and ColumnSetsA.IsUnique = 0 
      and isnull(ColumnSetsB.IsUnique, 0) = 0 
      and ColumnSetsA.ColumnMask <> ColumnSetsB.ColumnMask 
      and ColumnSetsA.ColumnMask & ColumnSetsB.ColumnMask = 0 

     --Remove duplicates 
     delete #ColumnSets 
     from #ColumnSets 
      left outer join --DistinctSets 
       (select ColumnMask, 
        min(ColumnSetID) ColumnSetID 
       from #ColumnSets 
       group by ColumnMask) DistinctSets 
       on #ColumnSets.ColumnMask = DistinctSets.ColumnMask 
       and #ColumnSets.ColumnSetID = DistinctSets.ColumnSetID 
     where DistinctSets.ColumnSetID is null 

     --Delete subsets of combinations known not to be unique. 
     delete CandidateSets 
     from #ColumnSets CandidateSets 
      inner join #ColumnSets Supersets on CandidateSets.ColumnMask & Supersets.ColumnMask = CandidateSets.ColumnMask 
     where SuperSets.IsUnique = 0 
      and CandidateSets.ColumnCount > 1 
      and CandidateSets.ColumnCount < SuperSets.ColumnCount 
      and CandidateSets.IsUnique is null 

     --Delete supersets of combinations already known to be unique. 
     delete CandidateSets 
     from #ColumnSets CandidateSets 
      inner join #ColumnSets UniqueSets on CandidateSets.ColumnMask & UniqueSets.ColumnMask = UniqueSets.ColumnMask 
     where UniqueSets.IsUnique > 0 
      and CandidateSets.ColumnCount > UniqueSets.ColumnCount 
      and CandidateSets.IsUnique is null 
    end  

delete CandidateSets 
from #ColumnSets CandidateSets 
    inner join #ColumnSets Supersets on CandidateSets.ColumnMask & Supersets.ColumnMask = CandidateSets.ColumnMask 
where SuperSets.IsUnique = 0 
    and CandidateSets.ColumnCount > 1 
    and CandidateSets.ColumnCount < SuperSets.ColumnCount 

delete 
from #ColumnSets 
where (IsUnique <> 0 or IsUnique is null) 
    and ColumnCount > 1 

set @SQLString = 'drop table ' + @SampleTableName 
exec (@SQLString) 


--Now start building permutations of columns and checking them for uniqueness. 
set @KeyLength = 1 
While @KeyLength <= @MaxColumns 
    begin 
     --If there are no more candidates left, then stop looking. 
     if (select count(*) from #ColumnSets where IsUnique is null) = 0 break 

     --If the estimated cardinality is less than the record count, we know the column set cannot possibly be unique. 
     update #ColumnSets 
     set IsUnique = 0 
     where Cardinality < @RecordCount 
      and IsUnique is null 

     --show status 
     select ColumnCount, 
      sum(case when IsUnique is null then 1 else 0 end) as Unknown, 
      sum(case when IsUnique = 0 then 1 else 0 end) as NotUnique, 
      sum(case when IsUnique = 1 then 1 else 0 end) as IsUnique 
     from #ColumnSets 
     group by ColumnCount 
     order by ColumnCount 

     --Get the cardinality of all untested column sets 
     set @Counter = @Counter + 
      (Select Count(*) 
      from #ColumnSets 
      where Cardinality >= @RecordCount 
       and IsUnique is null) 

     declare CSCursor cursor for 
      Select ColumnMask, 
       ColumnString 
      from #ColumnSets 
      where Cardinality >= @RecordCount 
       and IsUnique is null 

     Open CSCursor 
     Fetch next from CSCursor into @ColumnMask, @ColumnString 
     while @@FETCH_STATUS = 0 
      begin 
       set @SQLString = 'Update #ColumnSets set Cardinality = (select count(*) from (select distinct ' + @ColumnString + ' from ' + @TableName + ') DistinctValues), IsUnique = 0 where IsUnique is null and ColumnMask = ' + cast(@ColumnMask as varchar(20)) 
       exec (@SQLString) 
       fetch next from CSCursor into @ColumnMask, @ColumnString 
       if dateadd(mi, @MaxMinutes, @StartTime) < getdate() break 
      end 
     Close CSCursor 
     Deallocate CSCursor 

     --Delete any ColumnSets with fewer than two distinct values, as they cannot be part of a natural key 
     Delete from #ColumnSets where Cardinality < 2 

     --Any columnsets with a cardinality equal to the rowcount represent natural keys. 
     Update #ColumnSets set IsUnique = 1 where Cardinality = @RecordCount 
     if dateadd(mi, @MaxMinutes, @StartTime) < getdate() break 

     set @SQLString = 'select ColumnString as ''' + cast(@KeyLength as varchar(3)) + '-column keys found in ' + cast(datediff(second, @StartTime, getdate()) as varchar(10)) + ' seconds.'' from #ColumnSets where IsUnique = 1 and ColumnCount = ' + cast(@KeyLength as varchar(3)) + ' order by ColumnCount, ColumnString' 
     exec (@SQLString) 

     --Remove superfluous permutations (ColumnSet has same cardinality as one of its subsets) 
     delete ColumnSets 
     from #ColumnSets ColumnSets 
      inner join #ColumnSets SubSets 
       on ColumnSets.Cardinality = SubSets.Cardinality 
     where ColumnSets.ColumnCount > SubSets.ColumnCount 
      and ColumnSets.ColumnCount = @KeyLength 
      and ColumnSets.ColumnMask & SubSets.ColumnMask = SubSets.ColumnMask 

     --Identify and insert combinations of non-unique permutations that have insufficient potential cardinality to be unique 
     insert into #ColumnSets 
      (ColumnMask, 
      ColumnCount, 
      Cardinality, --estimate the cardinality as the product of the cardinalities of the component columns. 
      IsUnique, 
      ColumnString) 
     select ColumnSetsA.ColumnMask + ColumnSetsB.ColumnMask as ColumnMask, 
      ColumnSetsA.ColumnCount + ColumnSetsB.ColumnCount as ColumnCount, 
      min(ColumnSetsA.Cardinality * ColumnSetsB.Cardinality) as Cardinality, 
      0 as IsUnique, 
      min(ColumnSetsA.ColumnString + ', ' + ColumnSetsB.ColumnString) as ColumnString 
     from #ColumnSets ColumnSetsA 
      inner join #ColumnSets ColumnSetsB on ColumnSetsB.ColumnCount < ColumnSetsA.ColumnCount 
     where ColumnSetsA.ColumnCount = @KeyLength 
      and ColumnSetsA.Cardinality * ColumnSetsB.Cardinality <= @RecordCount 
      and ColumnSetsA.ColumnMask & ColumnSetsB.ColumnMask = 0 
      and ColumnSetsA.IsUnique = 0 
      and ColumnSetsB.IsUnique = 0 
     group by ColumnSetsA.ColumnMask + ColumnSetsB.ColumnMask, 
      ColumnSetsA.ColumnCount + ColumnSetsB.ColumnCount 

     --Insert new column sets 
     set @KeyLength = @KeyLength + 1 

     insert into #ColumnSets 
      (ColumnMask, 
      ColumnCount, 
      Cardinality, --estimate the cardinality as the product of the cardinalities of the component columns. 
      ColumnString) 
     select ParentSets.ColumnMask | UnarySets.ColumnMask as ColumnMask, 
      ParentSets.ColumnCount + UnarySets.ColumnCount as ColumnCount, 
      ParentSets.Cardinality * UnarySets.Cardinality as Cardinality, 
      ParentSets.ColumnString + ', ' + UnarySets.ColumnString as ColumnString 
     from #ColumnSets ParentSets 
      inner join #ColumnSets UnarySets on ParentSets.IsUnique = UnarySets.IsUnique 
     where ParentSets.IsUnique = 0 
      and ParentSets.ColumnCount = @KeyLength - 1 
      and UnarySets.ColumnCount = 1 
      and ParentSets.ColumnMask & UnarySets.ColumnMask = 0 

     --Remove duplicates 
     delete #ColumnSets 
     from #ColumnSets 
      left outer join --DistinctSets 
       (select ColumnMask, 
        min(ColumnString) ColumnString 
       from #ColumnSets 
       group by ColumnMask) DistinctSets 
       on #ColumnSets.ColumnMask = DistinctSets.ColumnMask 
       and #ColumnSets.ColumnString = DistinctSets.ColumnString 
     where DistinctSets.ColumnString is null 

     --Remove supersets of combinations already known to be unique. 
     delete CandidateSets 
     from #ColumnSets CandidateSets 
      inner join #ColumnSets UniqueSets on CandidateSets.ColumnMask & UniqueSets.ColumnMask = UniqueSets.ColumnMask 
     where UniqueSets.IsUnique = 1 
      and CandidateSets.IsUnique is null 

     --Remove subsets of combinations known not to be unique. 
     delete CandidateSets 
     from #ColumnSets CandidateSets 
      inner join #ColumnSets Supersets on CandidateSets.ColumnMask & Supersets.ColumnMask = CandidateSets.ColumnMask 
     where SuperSets.IsUnique = 0 
      and CandidateSets.IsUnique is null 
    end 

-- Make sure that all natural keys present their columns in a uniform alphabetical order 
declare CSCursor cursor for 
    Select ColumnMask 
    from #ColumnSets 
    where IsUnique = 1 

Open CSCursor 
Fetch next from CSCursor into @ColumnMask 
while @@FETCH_STATUS = 0 
    begin 
     set @ColumnString = null 
     select @ColumnString = isnull(@ColumnString + ', ', '') + ColumnString 
     from #ColumnSets 
     where ColumnCount = 1 
      and ColumnMask & @ColumnMask = ColumnMask 
     order by ColumnString 
     update #ColumnSets set ColumnString = @ColumnString where ColumnMask = @ColumnMask 
     fetch next from CSCursor into @ColumnMask 
    end 
Close CSCursor 
Deallocate CSCursor 


--Final output: 
select 'Found ' + cast(count(*) as varchar(10)) + ' natural keys with ' + cast(@MaxColumns as varchar(10)) + ' or fewer columns in ' + cast(datediff(minute, @StartTime, getdate()) as varchar(10)) + ' minutes.' from #ColumnSets where IsUnique = 1 

select ColumnString as NaturalKeys 
from #ColumnSets 
where IsUnique = 1 
order by ColumnCount, ColumnString 

select convert(varchar(20), @Counter) + ' permutations tested.' 

drop table #ColumnSets 

回答

3

如果你正在擁抱的微軟辦公軟件的可能性是使用SSIS。集成服務有一個數據分析任務。

這個網站有關於如何使用它的基本功能:http://consultingblogs.emc.com/jamiethomson/archive/2008/03/04/ssis-data-profiling-task-part-8-candidate-key.aspx

作爲警告它可能需要很長的時間取決於表的大小和列數在您的候選鍵包含。輸出對於識別按鍵的強度非常有用。在你的情況下,你會尋找100%的物品。

+0

非常感謝,看起來像我需要的東西。 – 2011-04-11 08:18:19