Friday, March 18, 2011

Breaking Down Database Query Results in Chunks of 65536 Rows for Excel 2003 Using Office Interop and VB .Net (Memory Efficient Edition)

So, my prior post had a solution for breaking down database query results from Excel into Worksheets of 65536 rows each. As previously mentioned in the post, it is a memory hog, and probably will blow through your assembly's allocation. I came up with a more memory-friendly solution to this issue. In essence, in this approach, I swap out memory processing for I/O processing.

This solution takes the following approach:
  1. Dump out contents of database query to Excel 2003 XML format using a SqlDataReader, broken by x rows (65536 rows in this case, in the spirit of Excel 2003's limitations)
  2. Open said Excel 2003 XML file using Excel Interop
  3. Do any post-processing formatting and niceties to your spreadsheet
  4. Save the file as a normal Excel Workbook
  5. Delete the source XML file (which does indeed get gigantic)
The result is a process which consumes, at most, 40-50 MB in memory (as opposed to the hundreds of MB in the prior approach) with performance close to the prior approach's in-memory + Excel Interop approach.

Code follows.

Excel XML Header (as referenced in the code below)



<?xml version="1.0"?>
<?mso-application progid="Excel.Sheet"?>
<Workbook xmlns="urn:schemas-microsoft-com:office:spreadsheet" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet" xmlns:html="http://www.w3.org/TR/REC-html40">
<DocumentProperties xmlns="urn:schemas-microsoft-com:office:office">
<Author>enograles</Author>
<LastAuthor>enograles</LastAuthor>
<Created>2011-03-18T16:24:37Z</Created>
<Company></Company>
<Version>12.00</Version>
</DocumentProperties>
<ExcelWorkbook xmlns="urn:schemas-microsoft-com:office:excel">
<WindowHeight>11895</WindowHeight>
<WindowWidth>19020</WindowWidth>
<WindowTopX>120</WindowTopX>
<WindowTopY>105</WindowTopY>
<ProtectStructure>False</ProtectStructure>
<ProtectWindows>False</ProtectWindows>
</ExcelWorkbook>
<Styles>
<Style ss:ID="sDate">
<NumberFormat ss:Format="Short Date"/>
</Style>
</Styles>



Implementation Code



Public Shared Function ExportToExcelXML(ByVal conn As SqlConnection, ByVal sql As String, _
Optional ByVal path As String = Nothing) As String

Dim excel As Application

Try
Dim destination As System.IO.DirectoryInfo
Dim workbookFullPathSource As String
Dim dtResult As New System.Data.DataTable
Dim textWriter As System.IO.TextWriter
Dim pageCount As Integer = 1
Dim rowCount As Integer = 0
Dim worksheetWritten As Boolean

' First grab the result set
Dim cmdResult As New SqlCommand(sql, conn)
Dim rdr As SqlDataReader = cmdResult.ExecuteReader

' Generate a new Guid for the Workbook name.
Dim workbook_name As String = System.Guid.NewGuid().ToString()

' First validate the destination
If String.IsNullOrEmpty(path) = False Then
If System.IO.Directory.Exists(path) Then
destination = New System.IO.DirectoryInfo(path)
Else
' Attempt to create it
destination = System.IO.Directory.CreateDirectory(path)
End If
Else
' Just drop it in the executing assembly's root folder if no path is specified
destination = New System.IO.DirectoryInfo(System.Reflection.Assembly.GetExecutingAssembly.Location.Substring(0, System.Reflection.Assembly.GetExecutingAssembly.Location.LastIndexOf("\")))

End If

' Construct the full path
workbookFullPathSource = destination.FullName & "\" & workbook_name & ".xml"

' Instantiate the writer
textWriter = New System.IO.StreamWriter(workbookFullPathSource, False)

' Write the header
WriteExcelXMLHeader(textWriter)

' Iterate through the results
If rdr.HasRows Then
While rdr.Read

' Iterate the sheet if we've reached the limit
If rowCount = 65536 Then
worksheetWritten = False
pageCount += 1
rowCount = 0 ' Reset the counter to 0
End If

' Define a sheet and write the headers
If worksheetWritten = False Then
If pageCount <> 1 Then
textWriter.WriteLine(" </Table>")
textWriter.WriteLine(" </Worksheet>")
End If

textWriter.WriteLine(" <Worksheet ss:Name=""Page " & pageCount & """>")
textWriter.WriteLine(" <Table ss:ExpandedColumnCount=""" & rdr.FieldCount & """>")
textWriter.WriteLine(" <Row ss:AutoFitHeight=""0"">")

' The headers
For i As Integer = 0 To rdr.FieldCount - 1
textWriter.WriteLine(" <Cell><Data ss:Type=""String"">" & rdr.GetName(i) & "</Data></Cell>")
Next

textWriter.WriteLine(" </Row>")

' Yes, the header counts as a row
worksheetWritten = True
rowCount += 1
End If

' Write the actual data
textWriter.WriteLine(" <Row ss:AutoFitHeight=""0"">")
For i As Integer = 0 To rdr.FieldCount - 1
Dim dataType As String
Dim dataContents As String = rdr.Item(i).ToString()

If TypeOf (rdr.Item(i)) Is String Then
dataType = "String"
ElseIf TypeOf (rdr.Item(i)) Is DateTime Then
dataType = "String"
dataContents = CType(rdr.Item(i), DateTime).ToString("MM/dd/yyyy")
ElseIf IsNumeric(rdr.Item(i)) Then
dataType = "Number"
Else
dataType = "String"
End If

' The data with the proper type
textWriter.WriteLine(" <Cell><Data ss:Type=""" & dataType & """>" & dataContents & "</Data></Cell>")
Next

' Terminate the row
textWriter.WriteLine(" </Row>")

' Iterate row counter
rowCount += 1
End While
Else
textWriter.WriteLine(" <Worksheet ss:Name=""Page " & pageCount & """>")
textWriter.WriteLine(" <Table ss:ExpandedColumnCount=""" & rdr.FieldCount & """>")
textWriter.WriteLine(" <Row ss:AutoFitHeight=""0"">")
textWriter.WriteLine(" <Cell><Data ss:Type=""String"">No Results Found from Query:" & cmdResult.CommandText & "</Data></Cell>")
textWriter.WriteLine(" </Row>")
End If

' Close out the writer
textWriter.WriteLine(" </Table>")
textWriter.WriteLine(" </Worksheet>")
textWriter.WriteLine("</Workbook>")
textWriter.Close()

' Process in Excel Interop for formatting and saving to a proper Excel document
excel = New Application()
excel.DisplayAlerts = False

Dim wbSource As Workbook = excel.Workbooks.Open(workbookFullPathSource)

' For all sheets, autofit columns, bold first rows, freeze panes on all worksheets
For i As Integer = 1 To wbSource.Worksheets.Count
Dim ws As Worksheet = CType(wbSource.Worksheets(i), Worksheet)
ws.Select()
CType(ws.Cells(1, 1), Range).EntireRow.Font.Bold = True
CType(ws.Cells(2, 1), Range).Select()
excel.ActiveWindow.FreezePanes = True
ws.Columns.AutoFit()
Next

' Select the first Worksheet
CType(wbSource.Worksheets(1), Worksheet).Select()

' Save as a workbook, exit out of Excel
wbSource.SaveAs(destination.FullName & "\" & workbook_name & ".xls", FileFormat:=Microsoft.Office.Interop.Excel.XlFileFormat.xlWorkbookNormal)
wbSource.Close()
excel.Quit()

' Delete the XML source
'System.IO.File.Delete(workbookFullPathSource)

Return workbook_name & ".xls" 'workbook_full_path

Catch ex As Exception
Throw ex
Finally
If excel Is Nothing = False Then
excel.Quit()
System.Runtime.InteropServices.Marshal.ReleaseComObject(excel)
excel = Nothing
GC.Collect()
End If
End Try
End Function

''' <summary>
''' This routine will write an Excel XML header before defining worksheets
''' </summary>
''' <param name="textWriter"></param>
''' <remarks></remarks>
Private Shared Sub WriteExcelXMLHeader(ByVal textWriter As System.IO.TextWriter)
textWriter.WriteLine(My.Resources.resMain.ExcelXMLHeader)
End Sub


Thursday, March 10, 2011

Breaking Down Database Query Results in Chunks of 65536 Rows for Excel 2003 Using Office Interop and VB .Net

March 10, 2011. Still hard to believe the date. Even harder to believe is how enterprises hold onto older versions of MS Office -- in our case, MS Office 2003. Yes, Excel 2007/2010 and the Ribbon UI brings a bit of a learning curve, but having worked extensively with 2003 and 2007/2010, I can honestly say that I am faster and way more productive on the new Ribbon UI. I have drank the Kool-Aid, I love the newer versions of Office.

The new version of Office give us some niceties not afforded to the 2003 version. Some neat niceties (the Excel file being a glorified Zip file, with XML standards), and some practical niceties... in the case of the latter, the elimination of that pesky 65536 row limitation. Alas, with my enterprise still on 2003, it is a limitation we developers need to live with. We have various utilities in .Net that export results out to Excel spreadsheets, because let's face it, as an ad-hoc BI tool, Excel is pretty darn good at analyzing volumes of data.

With that in mind, when you have a requirement for large amounts of data (think in the hundreds of thousands of rows) to be published out to an Excel 2003 Workbook, it poses a little bit of a challenge. Specifically, how do you break up your results into Pages of Worksheets (65536 rows in each page) that the end-users can manipulate to their hearts' contentment? A couple of issues you run into: (a) How to break down your results in chunks of 65536 rows and (b) How to insert them blocks at a time without blowing through the Range.Value2's memory limitation (which, as far as I can tell, is not even published)?

I came up with a routine that facilitates this functionality. Just a fair word of warning, it's a bit of a hack, using standard ADO .Net objects and clever pasting of arrays to the Range.Value2 property of the Excel Interop Model while gratuitously calling the Garbage Collector so we don't blow through our memory allocation (because it uses up a bunch to begin with in persisting the results to memory as an ADO .Net DataTable).


Not my neatest of routines, but it gets the job done. An obvious improvement would be to break down the work in multiple subroutines. Another improvement could be the utilization of memory -- I've only tested this routine for the upper bounds of our data requirements (about 900,000 points of data?), so for larger data requirements, you may need to tune it a bit so it will not blow through the assembly's memory allocation. Of course, if you come up with a clever way to clean the code up a bit, I'd be happy to hear about it!

Again, I reiterate my HACK ALERT statement:


  
Public Shared Function ExportToExcel(ByVal conn As SqlConnection, ByVal sql As String, _
Optional ByVal path As String = Nothing) As String

Dim excel As Application


Try
Dim destination As System.IO.DirectoryInfo
Dim workbook_full_path As String
Dim dtResult As New System.Data.DataTable
Dim lstSheets As New Generic.List(Of String)

' First grab the result set
Dim cmdResult As New SqlCommand(sql, conn)
Dim rdr As SqlDataReader = cmdResult.ExecuteReader
dtResult.Load(rdr)

' How many sheets will we need?
Dim sheet_count As Integer = SheetCount(dtResult.Rows.Count)

' Generate a new Guid for the Workbook name.
Dim workbook_name As String = System.Guid.NewGuid().ToString()

' First validate the destination
If String.IsNullOrEmpty(path) = False Then
If System.IO.Directory.Exists(path) Then
destination = New System.IO.DirectoryInfo(path)
Else
' Attempt to create it
destination = System.IO.Directory.CreateDirectory(path)
End If
Else
' Just drop it in the executing assembly's root folder if no path is specified
destination = New System.IO.DirectoryInfo(System.Reflection.Assembly.GetExecutingAssembly.Location.Substring(0, System.Reflection.Assembly.GetExecutingAssembly.Location.LastIndexOf("\")))

End If

' Construct the full path
workbook_full_path = destination.FullName & "\" & workbook_name & ".xls"

' Create Excel objects
excel = New Application
excel.DisplayAlerts = False
Dim wb As Workbook = excel.Workbooks.Add()
Dim results_processed As Integer = 0

' Tracking dictionary so we can order the sheets
Dim dicSheets As New Generic.Dictionary(Of Integer, Worksheet)

' Only do the Excel processing if we have results
' Create the right number of sheets
' And drop data into each sheet
If sheet_count > 0 Then
For i As Integer = 1 To sheet_count

' Create a sheet
Dim ws As Worksheet

' Put in order
If i > 1 Then
ws = wb.Worksheets.Add(Type.Missing, dicSheets(i - 1))
Else
ws = wb.Worksheets.Add
End If

dicSheets.Add(i, ws)

Dim results_outstanding As Integer = dtResult.Rows.Count - results_processed

ws.Name = "Page " & i.ToString()
lstSheets.Add(ws.Name)

' Start and end positions of the DataTable. Conditions for if we have
' more than 65536 results, we need to know the breakdown for where we
' should start and end (ordinally) on the table based on what Sheet we're on
Dim dt_start_row As Integer
Dim dt_end_row As Integer
Dim sheet_end_row As Integer

' Data Table Bounds
' Positionally on the DataTable, where are we extracting data?
If i > 1 Then
dt_start_row = (i - 1) * 65536 ' Minus one because the first sheet is rows 1-65536
Else
dt_start_row = 1
End If

If results_outstanding < 65536 Then
dt_end_row = dtResult.Rows.Count
sheet_end_row = results_outstanding + 1 ' We need it to be inclusive
Else
dt_end_row = i * 65535
sheet_end_row = 65535
End If

' Create a two dimensional array
' First dimension = rows
' Second dimension = columns
' Add + 1 to rows because the first row is always the column headers
Dim result(sheet_end_row + 1, dtResult.Columns.Count) As Object

' Publish the row header
For col As Integer = 0 To dtResult.Columns.Count - 1
result(0, col) = dtResult.Columns(col).ColumnName
Next

Dim sheet_row_count As Integer = 1

' Propagate the data down in the array
' subtract one since the DataTable is zero based
For result_data_row As Integer = dt_start_row - 1 To dt_end_row - 1

For result_data_col As Integer = 0 To dtResult.Columns.Count - 1

Dim value As Object = dtResult.Rows(result_data_row)(result_data_col)

' DateTimes come up weird on Excel
If TypeOf (value) Is DateTime Then
result(sheet_row_count, result_data_col) = value.ToString() 'dtResult.Rows(result_data_row)(result_data_col).ToString()
Else
result(sheet_row_count, result_data_col) = value 'dtResult.Rows(result_data_row)(result_data_col)
End If
Next

' Iterate the sheet's row counter (not the data table's row counter)
sheet_row_count += 1
Next

' Arbitrary number of rows to drop at a time so Excel's rng.Value2 doesn't exception out. Needs further refining, obviously
If sheet_row_count >= 65535 AndAlso dtResult.Columns.Count > 30 Then

' Drop in Chunks of 700 rows at a time
' We can change this later as we performance tune
Dim row_chunks As Integer = 700
Dim excel_sheet_max As Integer = 65536

For array_row As Integer = 0 To excel_sheet_max - 1 Step (row_chunks - 1)

Dim number_of_rows As Integer

' If we are at the very end, specify as such
' as it might not be a full 700 rows
If array_row + (row_chunks - 1) > excel_sheet_max Then
number_of_rows = excel_sheet_max - array_row
Else
number_of_rows = row_chunks
End If


Dim result_segment(number_of_rows, dtResult.Columns.Count) As Object
Dim rngBegin As Range = ws.Cells(array_row + 1, 1)
Dim rngEnd As Range = ws.Cells(array_row + number_of_rows, dtResult.Columns.Count)
Dim rngAll As Range = ws.Range(rngBegin.Address & ":" & rngEnd.Address)

' Copy the 700 rows of elements to the segment
Try
Array.Copy(result, array_row * (dtResult.Columns.Count + 1), result_segment, 0, number_of_rows * (dtResult.Columns.Count + 1))
rngAll.Value2 = result_segment
Catch e As Exception

End Try

Array.Clear(result_segment, result_segment.GetLowerBound(0), result_segment.Length)
Array.Clear(result_segment, result_segment.GetLowerBound(1), result_segment.Length)
result_segment = Nothing
Next

Else
Dim rngBegin As Range = ws.Cells(1, 1)
Dim rngEnd As Range = ws.Cells(sheet_end_row + 1, dtResult.Columns.Count)
Dim rngAll As Range = ws.Range(rngBegin.Address & ":" & rngEnd.Address)

rngAll.Value2 = result
End If



' Formatting -- bold and freeze header columns, autofit columns
CType(ws.Cells(1, 1), Range).EntireRow.Font.Bold = True
CType(ws.Cells(2, 1), Range).Select()
excel.ActiveWindow.FreezePanes = True
ws.Columns.AutoFit()

' Blank out the array
Array.Clear(result, result.GetLowerBound(0), result.Length)
Array.Clear(result, result.GetLowerBound(1), result.Length)
result = Nothing
GC.Collect()

' Keep track of what we've processed
results_processed += sheet_end_row
Next


' Cleanse sheets - remove Worksheets that aren't named "Page x"
Dim lstRemoveSheets As New Generic.List(Of String)
For Each objWS As Object In wb.Worksheets
Dim ws As Worksheet = CType(objWS, Worksheet)
If lstSheets.Contains(ws.Name) = False Then
lstRemoveSheets.Add(ws.Name)
End If
Next

For Each sheet_to_delete As String In lstRemoveSheets
CType(wb.Worksheets(sheet_to_delete), Worksheet).Delete()
Next
End If

' Select the first cell of the sheet

Dim wsFirst As Worksheet '= dicSheets(1)

' Indicate we received no results
If sheet_count = 0 Then
wsFirst = wb.Worksheets(1)
CType(wsFirst.Cells(1, 1), Range).Value2 = "No results found."
Else
wsFirst = dicSheets(1)
End If

wsFirst.Select()
CType(wsFirst.Cells(1, 1), Range).Select()

' Finally return the workbook name
wb.SaveAs(workbook_full_path)
wb.Close()

' Dispose the dtResult
dtResult.Dispose()
dtResult = Nothing

Return workbook_name & ".xls" 'workbook_full_path

Catch ex As Exception
Throw New Exception("Error encountered when exporting to Excel: " & ex.Message & " " & ex.StackTrace)
'Return Nothing ' A nothing returned implies unsuccessful creation
Finally
If excel Is Nothing = False Then
excel.Quit()
System.Runtime.InteropServices.Marshal.ReleaseComObject(excel)
excel = Nothing
GC.Collect()
End If
End Try
End Function