C# OPEN XML:从 EXCEL 获取数据到 DATATABLE 时跳过空单元格
C# OPEN XML: empty cells are getting skipped while getting data from EXCEL to DATATABLE
任务
将数据从 excel
导入到 DataTable
问题
将跳过不包含任何数据的单元格,并将行中包含数据的下一个单元格用作空列的值。
例如
A1 为空 A2 有值 Tom
然后在导入数据时 A1
得到值A2 和 A2 的 A2 保持为空
为了清楚起见,我在下面提供了一些屏幕截图
这是excel数据
这是从excel导入数据后的DataTable
代码
public class ImportExcelOpenXml
{
public static DataTable Fill_dataTable(string fileName)
{
DataTable dt = new DataTable();
using (SpreadsheetDocument spreadSheetDocument = SpreadsheetDocument.Open(fileName, false))
{
WorkbookPart workbookPart = spreadSheetDocument.WorkbookPart;
IEnumerable<Sheet> sheets = spreadSheetDocument.WorkbookPart.Workbook.GetFirstChild<Sheets>().Elements<Sheet>();
string relationshipId = sheets.First().Id.Value;
WorksheetPart worksheetPart = (WorksheetPart)spreadSheetDocument.WorkbookPart.GetPartById(relationshipId);
Worksheet workSheet = worksheetPart.Worksheet;
SheetData sheetData = workSheet.GetFirstChild<SheetData>();
IEnumerable<Row> rows = sheetData.Descendants<Row>();
foreach (Cell cell in rows.ElementAt(0))
{
dt.Columns.Add(GetCellValue(spreadSheetDocument, cell));
}
foreach (Row row in rows) //this will also include your header row...
{
DataRow tempRow = dt.NewRow();
for (int i = 0; i < row.Descendants<Cell>().Count(); i++)
{
tempRow[i] = GetCellValue(spreadSheetDocument, row.Descendants<Cell>().ElementAt(i));
}
dt.Rows.Add(tempRow);
}
}
dt.Rows.RemoveAt(0); //...so i'm taking it out here.
return dt;
}
public static string GetCellValue(SpreadsheetDocument document, Cell cell)
{
SharedStringTablePart stringTablePart = document.WorkbookPart.SharedStringTablePart;
string value = cell.CellValue.InnerXml;
if (cell.DataType != null && cell.DataType.Value == CellValues.SharedString)
{
return stringTablePart.SharedStringTable.ChildElements[Int32.Parse(value)].InnerText;
}
else
{
return value;
}
}
}
我的想法
我觉得
有问题
public IEnumerable<T> Descendants<T>() where T : OpenXmlElement;
如果我想使用 Descendants
计算列数
IEnumerable<Row> rows = sheetData.Descendants<<Row>();
int colCnt = rows.ElementAt(0).Count();
或
如果我使用 Descendants
获取行数
IEnumerable<Row> rows = sheetData.Descendants<<Row>();
int rowCnt = rows.Count();`
在这两种情况下,Descendants
都会跳过空单元格
有没有Descendants
的替代方案。
非常感谢您的建议
P.S:我也考虑过使用 A1、A2 等列名来获取单元格值,但为了做到这一点,我必须获取使用 Descendants
函数无法实现列和行的精确计数。
public void Read2007Xlsx()
{
try
{
DataTable dt = new DataTable();
using (SpreadsheetDocument spreadSheetDocument = SpreadsheetDocument.Open(@"D:\File.xlsx", false))
{
WorkbookPart workbookPart = spreadSheetDocument.WorkbookPart;
IEnumerable<Sheet> sheets = spreadSheetDocument.WorkbookPart.Workbook.GetFirstChild<Sheets>().Elements<Sheet>();
string relationshipId = sheets.First().Id.Value;
WorksheetPart worksheetPart = (WorksheetPart)spreadSheetDocument.WorkbookPart.GetPartById(relationshipId);
Worksheet workSheet = worksheetPart.Worksheet;
SheetData sheetData = workSheet.GetFirstChild<SheetData>();
IEnumerable<Row> rows = sheetData.Descendants<Row>();
foreach (Cell cell in rows.ElementAt(0))
{
dt.Columns.Add(GetCellValue(spreadSheetDocument, cell));
}
foreach (Row row in rows) //this will also include your header row...
{
DataRow tempRow = dt.NewRow();
int columnIndex = 0;
foreach (Cell cell in row.Descendants<Cell>())
{
// Gets the column index of the cell with data
int cellColumnIndex = (int)GetColumnIndexFromName(GetColumnName(cell.CellReference));
cellColumnIndex--; //zero based index
if (columnIndex < cellColumnIndex)
{
do
{
tempRow[columnIndex] = ""; //Insert blank data here;
columnIndex++;
}
while (columnIndex < cellColumnIndex);
}
tempRow[columnIndex] = GetCellValue(spreadSheetDocument, cell);
columnIndex++;
}
dt.Rows.Add(tempRow);
}
}
dt.Rows.RemoveAt(0); //...so i'm taking it out here.
}
catch (Exception ex)
{
}
}
/// <summary>
/// Given a cell name, parses the specified cell to get the column name.
/// </summary>
/// <param name="cellReference">Address of the cell (ie. B2)</param>
/// <returns>Column Name (ie. B)</returns>
public static string GetColumnName(string cellReference)
{
// Create a regular expression to match the column name portion of the cell name.
Regex regex = new Regex("[A-Za-z]+");
Match match = regex.Match(cellReference);
return match.Value;
}
/// <summary>
/// Given just the column name (no row index), it will return the zero based column index.
/// Note: This method will only handle columns with a length of up to two (ie. A to Z and AA to ZZ).
/// A length of three can be implemented when needed.
/// </summary>
/// <param name="columnName">Column Name (ie. A or AB)</param>
/// <returns>Zero based index if the conversion was successful; otherwise null</returns>
public static int? GetColumnIndexFromName(string columnName)
{
//return columnIndex;
string name = columnName;
int number = 0;
int pow = 1;
for (int i = name.Length - 1; i >= 0; i--)
{
number += (name[i] - 'A' + 1) * pow;
pow *= 26;
}
return number;
}
public static string GetCellValue(SpreadsheetDocument document, Cell cell)
{
SharedStringTablePart stringTablePart = document.WorkbookPart.SharedStringTablePart;
if (cell.CellValue ==null)
{
return "";
}
string value = cell.CellValue.InnerXml;
if (cell.DataType != null && cell.DataType.Value == CellValues.SharedString)
{
return stringTablePart.SharedStringTable.ChildElements[Int32.Parse(value)].InnerText;
}
else
{
return value;
}
}
Try this code, I have done little modifications and it worked for me.
public static DataTable Fill_dataTable(string filePath)
{
DataTable dt = new DataTable();
using (SpreadsheetDocument doc = SpreadsheetDocument.Open(filePath, false))
{
Sheet sheet = doc.WorkbookPart.Workbook.Sheets.GetFirstChild<Sheet>();
Worksheet worksheet = doc.WorkbookPart.GetPartById(sheet.Id.Value) as WorksheetPart.Worksheet;
IEnumerable<Row> rows = worksheet.GetFirstChild<SheetData>().Descendants<Row>();
DataTable dt = new DataTable();
List<string> columnRef = new List<string>();
foreach (Row row in rows)
{
if (row.RowIndex != null)
{
if (row.RowIndex.Value == 1)
{
foreach (Cell cell in row.Descendants<Cell>())
{
dt.Columns.Add(GetValue(doc, cell));
columnRef.Add(cell.CellReference.ToString().Substring(0, cell.CellReference.ToString().Length - 1));
}
}
else
{
dt.Rows.Add();
int i = 0;
foreach (Cell cell in row.Descendants<Cell>())
{
while (columnRef(i) + dt.Rows.Count + 1 != cell.CellReference)
{
dt.Rows(dt.Rows.Count - 1)(i) = "";
i += 1;
}
dt.Rows(dt.Rows.Count - 1)(i) = GetValue(doc, cell);
i += 1;
}
}
}
}
}
return dt;
}
如果一行的所有单元格中都有一些数据,那么一切正常。但是,如果您碰巧连续有一个空单元格,那么事情就会变得一团糟。
为什么它首先发生?
原因在于下面的代码行:
row.Descendants<Cell>().Count()
Count()
是 非空 单元格的数量,即它将忽略所有没有数据的单元格。因此,当您将 row.Descendants<Cell>().ElementAt(i)
作为参数传递给 GetCellValue
方法时,如下所示:
GetCellValue(spreadSheetDocument, row.Descendants<Cell>().ElementAt(i));
然后它将找到下一个非空单元格的内容(不一定是该列索引中的内容,i
)例如如果第一列为空并且我们调用 ElementAt(1)
,它会 returns 取而代之的是第二列中的值,整个逻辑就会变得混乱。
解决方案:我们需要处理空单元格的出现 - 本质上我们需要找出单元格的原始列索引,以防它之前有空单元格。因此,您需要替换下面的 for 循环代码
for (int i = 0; i < row.Descendants<Cell>().Count(); i++)
{
tempRow[i] = GetCellValue(spreadSheetDocument, row.Descendants<Cell>().ElementAt(i));
}
和
for (int i = 0; i < row.Descendants<Cell>().Count(); i++)
{
Cell cell = row.Descendants<Cell>().ElementAt(i);
int actualCellIndex = CellReferenceToIndex(cell);
tempRow[actualCellIndex] = GetCellValue(spreadSheetDocument, cell);
}
此外,在您的代码中添加以下方法,该方法在上述修改后的代码片段中用于获取任何单元格的 original/correct 列索引:
private static int CellReferenceToIndex(Cell cell)
{
int index = 0;
string reference = cell.CellReference.ToString().ToUpper();
foreach (char ch in reference)
{
if (Char.IsLetter(ch))
{
int value = (int)ch - (int)'A';
index = (index == 0) ? value : ((index + 1) * 26) + value;
}
else
{
return index;
}
}
return index;
}
任务
将数据从 excel
导入到 DataTable
问题
将跳过不包含任何数据的单元格,并将行中包含数据的下一个单元格用作空列的值。 例如
A1 为空 A2 有值 Tom
然后在导入数据时 A1
得到值A2 和 A2 的 A2 保持为空
为了清楚起见,我在下面提供了一些屏幕截图
这是excel数据
这是从excel导入数据后的DataTable
代码
public class ImportExcelOpenXml
{
public static DataTable Fill_dataTable(string fileName)
{
DataTable dt = new DataTable();
using (SpreadsheetDocument spreadSheetDocument = SpreadsheetDocument.Open(fileName, false))
{
WorkbookPart workbookPart = spreadSheetDocument.WorkbookPart;
IEnumerable<Sheet> sheets = spreadSheetDocument.WorkbookPart.Workbook.GetFirstChild<Sheets>().Elements<Sheet>();
string relationshipId = sheets.First().Id.Value;
WorksheetPart worksheetPart = (WorksheetPart)spreadSheetDocument.WorkbookPart.GetPartById(relationshipId);
Worksheet workSheet = worksheetPart.Worksheet;
SheetData sheetData = workSheet.GetFirstChild<SheetData>();
IEnumerable<Row> rows = sheetData.Descendants<Row>();
foreach (Cell cell in rows.ElementAt(0))
{
dt.Columns.Add(GetCellValue(spreadSheetDocument, cell));
}
foreach (Row row in rows) //this will also include your header row...
{
DataRow tempRow = dt.NewRow();
for (int i = 0; i < row.Descendants<Cell>().Count(); i++)
{
tempRow[i] = GetCellValue(spreadSheetDocument, row.Descendants<Cell>().ElementAt(i));
}
dt.Rows.Add(tempRow);
}
}
dt.Rows.RemoveAt(0); //...so i'm taking it out here.
return dt;
}
public static string GetCellValue(SpreadsheetDocument document, Cell cell)
{
SharedStringTablePart stringTablePart = document.WorkbookPart.SharedStringTablePart;
string value = cell.CellValue.InnerXml;
if (cell.DataType != null && cell.DataType.Value == CellValues.SharedString)
{
return stringTablePart.SharedStringTable.ChildElements[Int32.Parse(value)].InnerText;
}
else
{
return value;
}
}
}
我的想法
我觉得
有问题public IEnumerable<T> Descendants<T>() where T : OpenXmlElement;
如果我想使用 Descendants
IEnumerable<Row> rows = sheetData.Descendants<<Row>();
int colCnt = rows.ElementAt(0).Count();
或
如果我使用 Descendants
IEnumerable<Row> rows = sheetData.Descendants<<Row>();
int rowCnt = rows.Count();`
在这两种情况下,Descendants
都会跳过空单元格
有没有Descendants
的替代方案。
非常感谢您的建议
P.S:我也考虑过使用 A1、A2 等列名来获取单元格值,但为了做到这一点,我必须获取使用 Descendants
函数无法实现列和行的精确计数。
public void Read2007Xlsx()
{
try
{
DataTable dt = new DataTable();
using (SpreadsheetDocument spreadSheetDocument = SpreadsheetDocument.Open(@"D:\File.xlsx", false))
{
WorkbookPart workbookPart = spreadSheetDocument.WorkbookPart;
IEnumerable<Sheet> sheets = spreadSheetDocument.WorkbookPart.Workbook.GetFirstChild<Sheets>().Elements<Sheet>();
string relationshipId = sheets.First().Id.Value;
WorksheetPart worksheetPart = (WorksheetPart)spreadSheetDocument.WorkbookPart.GetPartById(relationshipId);
Worksheet workSheet = worksheetPart.Worksheet;
SheetData sheetData = workSheet.GetFirstChild<SheetData>();
IEnumerable<Row> rows = sheetData.Descendants<Row>();
foreach (Cell cell in rows.ElementAt(0))
{
dt.Columns.Add(GetCellValue(spreadSheetDocument, cell));
}
foreach (Row row in rows) //this will also include your header row...
{
DataRow tempRow = dt.NewRow();
int columnIndex = 0;
foreach (Cell cell in row.Descendants<Cell>())
{
// Gets the column index of the cell with data
int cellColumnIndex = (int)GetColumnIndexFromName(GetColumnName(cell.CellReference));
cellColumnIndex--; //zero based index
if (columnIndex < cellColumnIndex)
{
do
{
tempRow[columnIndex] = ""; //Insert blank data here;
columnIndex++;
}
while (columnIndex < cellColumnIndex);
}
tempRow[columnIndex] = GetCellValue(spreadSheetDocument, cell);
columnIndex++;
}
dt.Rows.Add(tempRow);
}
}
dt.Rows.RemoveAt(0); //...so i'm taking it out here.
}
catch (Exception ex)
{
}
}
/// <summary>
/// Given a cell name, parses the specified cell to get the column name.
/// </summary>
/// <param name="cellReference">Address of the cell (ie. B2)</param>
/// <returns>Column Name (ie. B)</returns>
public static string GetColumnName(string cellReference)
{
// Create a regular expression to match the column name portion of the cell name.
Regex regex = new Regex("[A-Za-z]+");
Match match = regex.Match(cellReference);
return match.Value;
}
/// <summary>
/// Given just the column name (no row index), it will return the zero based column index.
/// Note: This method will only handle columns with a length of up to two (ie. A to Z and AA to ZZ).
/// A length of three can be implemented when needed.
/// </summary>
/// <param name="columnName">Column Name (ie. A or AB)</param>
/// <returns>Zero based index if the conversion was successful; otherwise null</returns>
public static int? GetColumnIndexFromName(string columnName)
{
//return columnIndex;
string name = columnName;
int number = 0;
int pow = 1;
for (int i = name.Length - 1; i >= 0; i--)
{
number += (name[i] - 'A' + 1) * pow;
pow *= 26;
}
return number;
}
public static string GetCellValue(SpreadsheetDocument document, Cell cell)
{
SharedStringTablePart stringTablePart = document.WorkbookPart.SharedStringTablePart;
if (cell.CellValue ==null)
{
return "";
}
string value = cell.CellValue.InnerXml;
if (cell.DataType != null && cell.DataType.Value == CellValues.SharedString)
{
return stringTablePart.SharedStringTable.ChildElements[Int32.Parse(value)].InnerText;
}
else
{
return value;
}
}
Try this code, I have done little modifications and it worked for me.
public static DataTable Fill_dataTable(string filePath)
{
DataTable dt = new DataTable();
using (SpreadsheetDocument doc = SpreadsheetDocument.Open(filePath, false))
{
Sheet sheet = doc.WorkbookPart.Workbook.Sheets.GetFirstChild<Sheet>();
Worksheet worksheet = doc.WorkbookPart.GetPartById(sheet.Id.Value) as WorksheetPart.Worksheet;
IEnumerable<Row> rows = worksheet.GetFirstChild<SheetData>().Descendants<Row>();
DataTable dt = new DataTable();
List<string> columnRef = new List<string>();
foreach (Row row in rows)
{
if (row.RowIndex != null)
{
if (row.RowIndex.Value == 1)
{
foreach (Cell cell in row.Descendants<Cell>())
{
dt.Columns.Add(GetValue(doc, cell));
columnRef.Add(cell.CellReference.ToString().Substring(0, cell.CellReference.ToString().Length - 1));
}
}
else
{
dt.Rows.Add();
int i = 0;
foreach (Cell cell in row.Descendants<Cell>())
{
while (columnRef(i) + dt.Rows.Count + 1 != cell.CellReference)
{
dt.Rows(dt.Rows.Count - 1)(i) = "";
i += 1;
}
dt.Rows(dt.Rows.Count - 1)(i) = GetValue(doc, cell);
i += 1;
}
}
}
}
}
return dt;
}
如果一行的所有单元格中都有一些数据,那么一切正常。但是,如果您碰巧连续有一个空单元格,那么事情就会变得一团糟。
为什么它首先发生?
原因在于下面的代码行:
row.Descendants<Cell>().Count()
Count()
是 非空 单元格的数量,即它将忽略所有没有数据的单元格。因此,当您将 row.Descendants<Cell>().ElementAt(i)
作为参数传递给 GetCellValue
方法时,如下所示:
GetCellValue(spreadSheetDocument, row.Descendants<Cell>().ElementAt(i));
然后它将找到下一个非空单元格的内容(不一定是该列索引中的内容,i
)例如如果第一列为空并且我们调用 ElementAt(1)
,它会 returns 取而代之的是第二列中的值,整个逻辑就会变得混乱。
解决方案:我们需要处理空单元格的出现 - 本质上我们需要找出单元格的原始列索引,以防它之前有空单元格。因此,您需要替换下面的 for 循环代码
for (int i = 0; i < row.Descendants<Cell>().Count(); i++)
{
tempRow[i] = GetCellValue(spreadSheetDocument, row.Descendants<Cell>().ElementAt(i));
}
和
for (int i = 0; i < row.Descendants<Cell>().Count(); i++)
{
Cell cell = row.Descendants<Cell>().ElementAt(i);
int actualCellIndex = CellReferenceToIndex(cell);
tempRow[actualCellIndex] = GetCellValue(spreadSheetDocument, cell);
}
此外,在您的代码中添加以下方法,该方法在上述修改后的代码片段中用于获取任何单元格的 original/correct 列索引:
private static int CellReferenceToIndex(Cell cell)
{
int index = 0;
string reference = cell.CellReference.ToString().ToUpper();
foreach (char ch in reference)
{
if (Char.IsLetter(ch))
{
int value = (int)ch - (int)'A';
index = (index == 0) ? value : ((index + 1) * 26) + value;
}
else
{
return index;
}
}
return index;
}