T-SQL: Creating a hierarchical structure from relational data

Recently, while developing an SSIS package, I came across a scenario where I needed to create parent-child relationships between rows in a relational table. Let’s have a look at some sample data:

Source Data

Notice the values in the WBSCode column, 001 is the parent of all 001-XX rows, 001-02 is the parent of all 001-02-XX rows and so on. We need to create a relationship by populating the ParentID field so that the resulting structure can be traversed using a recursive CTE. Here’s a code snippet to create some sample data.

--create sample data
Declare @ProjectWBS TABLE
(
    ID int IDENTITY(1,1) NOT NULL Primary Key,
    ProjectCode varchar(20) NULL,
    WBSCode varchar(100) NULL,
    WBSDescription varchar(max) NULL,
    ParentID int NULL
)

INSERT @ProjectWBS (ProjectCode, WBSCode, WBSDescription, ParentID)
    Select 'Project1', '001', 'Phase 1', NULL
    union all Select 'Project1', '001-01', 'Project Mgmt', NULL
    union all Select 'Project1', '001-02', 'Requirements', NULL
    union all Select 'Project1', '001-02-01', 'Requirements Meeting', NULL
    union all Select 'Project1', '001-02-02', 'Requirements Write Up', NULL
    union all Select 'Project1', '001-03', 'Installation', NULL
    union all Select 'Project1', '001-03-01', 'Nav Install', NULL
    union all Select 'Project1', '001-03-02', 'WTE Install', NULL
    union all Select 'Project1', '001-03-03', 'Reporting Service', NULL
    union all Select 'Project1', '001-03-04', 'BI Install', NULL
    union all Select 'Project1', '001-04', 'Setup', NULL
    union all Select 'Project1', '001-04-01', 'Financials Setup', NULL
    union all Select 'Project1', '001-04-02', 'Projects Setup', NULL
    union all Select 'Project1', '001-04-03', 'Allocation Setup', NULL
    union all Select 'Project1', '001-04-04', 'WTE Setup', NULL
    union all Select 'Project1', '001-04-05', 'Users and Security', NULL
    union all Select 'Project1', '001-05', 'Training', NULL
    union all Select 'Project1', '001-06', 'Testing', NULL
    union all Select 'Project1', '001-06-01', 'Financial Testing', NULL
    union all Select 'Project1', '001-06-02', 'Project Testing', NULL
    union all Select 'Project1', '001-06-03', 'Project Allocation Testing', NULL
    union all Select 'Project1', '001-06-04', 'WTE Testing', NULL
    union all Select 'Project1', '001-06-05', 'Users and Security Testing', NULL
    union all Select 'Project1', '001-07', 'Onsite Go Live', NULL
    union all Select 'Project1', '001-08', 'Development', NULL
    union all Select 'Project1', '001-08-01', 'Overtime functionality', NULL
    union all Select 'Project1', '001-09', 'Data Migration', NULL
    union all Select 'Project1', '002', 'Phase 2', NULL
    union all Select 'Project1', '002-01', 'Monthly Support', NULL
    union all Select 'Project1', '002-01-02', 'Support Training', NULL
    union all Select 'Project1', '002-01-03', 'Support Account Mgmt', NULL

--Select * from @ProjectWBS

I started by creating the following function that returns no. of occurrences of a particular character in a given string:

CREATE FUNCTION fnc_CountChar(@string varchar(max), @char char(1) )
RETURNS INT
BEGIN
    RETURN (LEN(@string) - LEN(REPLACE(@string, @char, '')))
END

After this, I declared the following CTE:

--create hierarchical structure
;With WBSHierarchy
as
(
    Select Parent.ID, Parent.ProjectCode, Parent.WBSCode, Parent.WBSDescription,
        0 as Level, null as ParentID, Cast(null as varchar(MAX)) as ParentWBSCode
    From @ProjectWBS Parent
    Where dbo.fnc_CountChar(Parent.WBSCode, '-') = 0 --no hiphen "-" in parents

    union All

    Select Child.ID, Child.ProjectCode, Child.WBSCode, Child.WBSDescription,
        Parent.Level+1, Parent.ID as ParentID, Cast(Parent.WBSCode as varchar(max)) as ParentWBSCode
        From @ProjectWBS Child
        inner join WBSHierarchy Parent on Child.WBSCode Like Parent.WBSCode + '-%' -- childWBS = parentWBS-xx
            and dbo.fnc_CountChar(Child.WBSCode, '-') = Parent.Level + 1 --childs should have only one hiphen more than their parent
)

--Select * from WBSHierarchy order by WBSCode

Notice that in the above CTE definition, I considered all rows with single occurrence of “-” as the parent rows. Then, I used the fact that ChildWBS=ParentWBS-XX, and the no. of “-” occurrences in child WBS will be equal to one greater than its parent. Here’s the output of the above CTE.

Hierarchical Structure

Things were much simple after this, and I was able to write a single update query to populate the ParentID fields of all the rows in the original table as follows:

--update parentID
update @ProjectWBS
    set ParentID = H.ParentID
from @ProjectWBS W
inner join WBSHierarchy H on W.ID=H.ID

That’s all. Now the result can be traversed using the following recursive CTE:

--traverse the resulting structure using a recursive CTE
;With WBSHierarchy
As
(
    Select Parent.ID, Parent.ProjectCode, Parent.WBSCode, Parent.WBSDescription, 0 as Level
        from @ProjectWBS Parent
        where Parent.ParentID is null
    union all
    Select Child.ID, Child.ProjectCode, Child.WBSCode, Child.WBSDescription, Parent.Level+1 as Level
        from @ProjectWBS Child
        inner join WBSHierarchy Parent on Child.ParentID=Parent.ID
)

Select ProjectCode, WBSCode, space(Level*4)+WBSDescription as WBSDescription
from WBSHierarchy order by WBSCode

CTE Output

Update:
I got a very nice solution from Mitesh in the comments below. He used the fact that if we drop all the characters after the last “-” then we can get the parent WBS, i.e. 001-01-01 can be transformed into 001-01. Great find, Mitesh! Hence, we can do all the transformation without a recursive CTE¬†or UDF like this:

;WITH CTE
AS
(
	SELECT
		WbsCode,
		SUBSTRING(WbsCode,1,LEN(WbsCode)-
			CASE 
				WHEN CHARINDEX('-',REVERSE(WbsCode)) > 0 
				THEN CHARINDEX('-',REVERSE(WbsCode)) 
			END  
		) AS EParentID ,
		ParentID
	FROM 
		@ProjectWBS
)

UPDATE
	C
SET
	PARENTID	= P.ID
FROM
	@ProjectWBS	P
INNER JOIN
	CTE	C
ON
	C.EParentID	=	P.WBSCode

Let me know if you have any other solution to such scenario.

Advertisements

TSQL Challenge 8: Using recursive CTE for a hierarchical relationship

Last week, I submitted an entry for T-SQL challenge 8. This time, the contestants were asked to process a hierarchy using recursive CTEs without applying any filters inside the CTE. The big challenge was the condition that the CTE should not contain any filter for a specific manager.
Here is some code for populating test data.

--Populate test data
DECLARE @Employees TABLE (EmpID INT, EmpName VARCHAR(20), ReportsTo INT)
INSERT INTO @Employees(EmpID, EmpName, ReportsTo)
  SELECT 1, 'Jacob', NULL UNION ALL
  SELECT 2, 'Rui', NULL UNION ALL
  SELECT 3, 'Jacobson', NULL UNION ALL
  SELECT 4, 'Jess', 1 UNION ALL
  SELECT 5, 'Steve', 1 UNION ALL
  SELECT 6, 'Bob', 1 UNION ALL
  SELECT 7, 'Smith', 2 UNION ALL
  SELECT 8, 'Bobbey', 2 UNION ALL
  SELECT 9, 'Steffi', 3 UNION ALL
  SELECT 10, 'Bracha', 3 UNION ALL
  SELECT 11, 'John', 5 UNION ALL
  SELECT 12, 'Michael', 6 UNION ALL
  SELECT 13, 'Paul', 6 UNION ALL
  SELECT 14, 'Lana', 7 UNION ALL
  SELECT 15, 'Johnson', 7 UNION ALL
  SELECT 16, 'Mic', 8 UNION ALL
  SELECT 17, 'Stev', 8 UNION ALL
  SELECT 18, 'Paulson', 9 UNION ALL
  SELECT 19, 'Jessica', 10

I started by creating a recursive CTE for the whole table like this:

;With Hierarchy(EmpName, EmpID, Level, FullyQualifiedName)
As
(
  Select E.EmpName, E.EmpID, 0, Cast('.'+E.EmpName+'.' as Varchar(MAX))
    From @Employees E
    Where E.ReportsTo is null
  Union all
  Select E.EmpName, E.EmpID, H.Level+1, H.FullyQualifiedName+'.'+E.EmpName+'.'
    from @Employees E
    inner join Hierarchy H on H.EmpID=E.ReportsTo
)

Select Space(Level*4) + H.EmpName
  from Hierarchy H
  order by H.FullyQualifiedName

Result of CTE

Notice that I am constructing a FullyQualifiedName value for each row. This value consists of full path from root to the current person. This approach helped me in filtering the records in the final select statement where I just needed to look for a '.' + ManagerName + '.' filter. Here is a select query that extracts all subordinates for a particular manager.

Select Space(Level*4) + H.EmpName
  from Hierarchy H
  where CHARINDEX('.'+(Select Top(1) E.EmpName from @Employees E Where E.EmpName=@manager)+'.', H.FullyQualifiedName) > 0
  order by H.FullyQualifiedName

The only issue that remains is that when the query is run for a non-parent employee, we get extra spaces in the beginning. E.g. when we run the query for “Paul”, we get an output like:

Spacing issue

I solved it by inserted a sub-query in the space() expression like this:

Select Space((Level-(Select Top(1) Level from Hierarchy H2 Where H2.EmpName=@manager))*4) + EmpName
  from Hierarchy H
  where CHARINDEX('.'+(Select Top(1) E.EmpName from @Employees E Where E.EmpName=@manager)+'.', H.FullyQualifiedName) > 0
  order by H.FullyQualifiedName

Instead of two subqueries, I tried to use a join but the performance using sub-queries was better so I submitted the subquery solution. Here’s the join version I am referring, it looks very nice and clean:

Select Space((H.Level-H2.Level)*4) + H.EmpName
  from Hierarchy H
  inner join Hierarchy H2 on CHARINDEX('.'+H2.EmpName+'.', H.FullyQualifiedName) > 0
  where H2.EmpName=@manager
  order by H.FullyQualifiedName

In the last, here is the complete solution:

--Populate test data
DECLARE @Employees TABLE (EmpID INT, EmpName VARCHAR(20), ReportsTo INT)
INSERT INTO @Employees(EmpID, EmpName, ReportsTo)
  SELECT 1, 'Jacob', NULL UNION ALL
  SELECT 2, 'Rui', NULL UNION ALL
  SELECT 3, 'Jacobson', NULL UNION ALL
  SELECT 4, 'Jess', 1 UNION ALL
  SELECT 5, 'Steve', 1 UNION ALL
  SELECT 6, 'Bob', 1 UNION ALL
  SELECT 7, 'Smith', 2 UNION ALL
  SELECT 8, 'Bobbey', 2 UNION ALL
  SELECT 9, 'Steffi', 3 UNION ALL
  SELECT 10, 'Bracha', 3 UNION ALL
  SELECT 11, 'John', 5 UNION ALL
  SELECT 12, 'Michael', 6 UNION ALL
  SELECT 13, 'Paul', 6 UNION ALL
  SELECT 14, 'Lana', 7 UNION ALL
  SELECT 15, 'Johnson', 7 UNION ALL
  SELECT 16, 'Mic', 8 UNION ALL
  SELECT 17, 'Stev', 8 UNION ALL
  SELECT 18, 'Paulson', 9 UNION ALL
  SELECT 19, 'Jessica', 10

--Solution starts here
DECLARE @manager VARCHAR(20)
SELECT @manager = 'Jacob'

--CTE
;With Hierarchy(EmpName, EmpID, Level, FullyQualifiedName)
As
(
  Select E.EmpName, E.EmpID, 0, Cast('.'+E.EmpName+'.' as Varchar(MAX))
    From @Employees E
    Where E.ReportsTo is null
  Union all
  Select E.EmpName, E.EmpID, H.Level+1, H.FullyQualifiedName+'.'+E.EmpName+'.'
    from @Employees E
    inner join Hierarchy H on H.EmpID=E.ReportsTo
)

--Result
Select Space((Level-(Select Top(1) Level from Hierarchy H2 Where H2.EmpName=@manager))*4) + EmpName
  from Hierarchy H
  where CHARINDEX('.'+(Select Top(1) E.EmpName from @Employees E Where E.EmpName=@manager)+'.', H.FullyQualifiedName) > 0
  order by H.FullyQualifiedName