C#实现高效读取CSV文件的完整指南

CSV(Comma-Separated Values)是最常见的数据交换格式之一。在C#中高效读取CSV文件需要考虑性能、内存使用和错误处理等多个方面。本文将提供从基础到高级的完整解决方案。

图片[1]_C#实现高效读取CSV文件的完整指南_知途无界

1. 基础方法:使用 TextFieldParser

Microsoft.VisualBasic.TextFieldParser 是一个强大且易用的CSV解析工具,性能优秀且内置在.NET Framework中。

安装和引用

// 在 .NET Core 3.0 及更高版本(含 .NET 5+)中,TextFieldParser 位于随框架提供的 Microsoft.VisualBasic.Core 程序集中,通常无需额外安装
// 仅当目标框架未自带该程序集时,才需要添加 NuGet 包:Install-Package Microsoft.VisualBasic

using Microsoft.VisualBasic.FileIO;
using System.Globalization;

基本实现

/// <summary>
/// Minimal CSV reader built on <see cref="TextFieldParser"/>.
/// </summary>
public class CsvReaderUsingTextFieldParser
{
    /// <summary>
    /// Reads every record of a comma-delimited file into memory.
    /// </summary>
    /// <param name="filePath">Path of the CSV file to read.</param>
    /// <returns>
    /// One string array per successfully parsed record. Malformed lines are
    /// reported on the console and skipped.
    /// </returns>
    public static List<string[]> ReadCsvBasic(string filePath)
    {
        var records = new List<string[]>();

        using var parser = new TextFieldParser(filePath)
        {
            TextFieldType = FieldType.Delimited,
            Delimiters = new string[] { "," },
            HasFieldsEnclosedInQuotes = true,
            TrimWhiteSpace = true,
            Culture = CultureInfo.InvariantCulture
        };

        while (!parser.EndOfData)
        {
            try
            {
                string[] fields = parser.ReadFields();

                // Never store a null entry in the result list.
                if (fields != null)
                {
                    records.Add(fields);
                }
            }
            catch (MalformedLineException ex)
            {
                // ErrorLineNumber identifies the line that failed; LineNumber has
                // already advanced past it (or is -1 at the end of the file).
                Console.WriteLine($"行解析错误 {parser.ErrorLineNumber}: {ex.Message}");
                // The offending line is skipped and parsing continues with the next one.
            }
        }

        return records;
    }
}

高级配置版本

/// <summary>
/// Configurable CSV reader built on <see cref="TextFieldParser"/> that returns
/// each record as a column-name to value dictionary.
/// </summary>
public class AdvancedCsvReader
{
    /// <summary>Options controlling how <see cref="ReadCsvWithHeaders"/> parses a file.</summary>
    public class CsvOptions
    {
        /// <summary>Field delimiter character.</summary>
        public char Delimiter { get; set; } = ',';

        /// <summary>When true, the first retained record supplies the dictionary keys.</summary>
        public bool HasHeader { get; set; } = false;

        /// <summary>Forwarded to <see cref="TextFieldParser.TrimWhiteSpace"/>.</summary>
        public bool TrimWhitespace { get; set; } = true;

        /// <summary>When true, records whose fields are all null/whitespace are dropped.</summary>
        public bool SkipEmptyLines { get; set; } = true;

        /// <summary>A record is skipped when its first field starts with one of these tokens.</summary>
        public string[] CommentTokens { get; set; } = { "#", "//" };

        /// <summary>Requested buffer size (64 KB by default). Currently not applied to the parser.</summary>
        public int BufferSize { get; set; } = 65536; // 64KB buffer
    }

    /// <summary>
    /// Reads a delimited file and returns one dictionary per data record.
    /// </summary>
    /// <param name="filePath">Path of the CSV file to read.</param>
    /// <param name="options">Parsing options; defaults are used when null.</param>
    /// <returns>
    /// With <see cref="CsvOptions.HasHeader"/> set, keys are the header names
    /// (fields beyond the header count are ignored). Otherwise keys are
    /// <c>Column0</c>, <c>Column1</c>, ... Records that fail to parse are
    /// reported on the console and skipped.
    /// </returns>
    public static List<Dictionary<string, string>> ReadCsvWithHeaders(string filePath, CsvOptions options = null)
    {
        options ??= new CsvOptions();

        var records = new List<Dictionary<string, string>>();
        string[] headers = null;

        using var parser = new TextFieldParser(filePath)
        {
            TextFieldType = FieldType.Delimited,
            Delimiters = new string[] { options.Delimiter.ToString() },
            HasFieldsEnclosedInQuotes = true,
            TrimWhiteSpace = options.TrimWhitespace,
            Culture = CultureInfo.InvariantCulture
        };

        SetParserBufferSize(parser, options.BufferSize);

        while (!parser.EndOfData)
        {
            try
            {
                string[] fields = parser.ReadFields();
                if (fields == null)
                {
                    continue;
                }

                // Skip empty records.
                if (options.SkipEmptyLines && fields.All(string.IsNullOrWhiteSpace))
                {
                    continue;
                }

                // Skip comment records; guard against an empty field array before indexing.
                if (fields.Length > 0 && IsComment(fields[0], options.CommentTokens))
                {
                    continue;
                }

                // The header is the first record that survives the filters above,
                // not necessarily the first record in the file (which may be a comment).
                if (options.HasHeader && headers == null)
                {
                    headers = fields;
                    continue;
                }

                if (headers != null)
                {
                    var record = new Dictionary<string, string>();
                    for (int i = 0; i < headers.Length && i < fields.Length; i++)
                    {
                        record[headers[i]] = fields[i];
                    }
                    records.Add(record);
                }
                else
                {
                    records.Add(fields.Select((field, index) => new { field, index })
                                     .ToDictionary(x => $"Column{x.index}", x => x.field));
                }
            }
            catch (MalformedLineException ex)
            {
                // ErrorLineNumber is the line that failed; LineNumber has already moved on.
                Console.WriteLine($"解析错误 行 {parser.ErrorLineNumber}: {ex.Message}");
            }
            catch (Exception ex)
            {
                // Best-effort reading: report the failure and keep going.
                Console.WriteLine($"未知错误 行 {parser.LineNumber}: {ex.Message}");
            }
        }

        return records;
    }

    /// <summary>Returns true when <paramref name="firstField"/> starts with any non-empty comment token.</summary>
    private static bool IsComment(string firstField, string[] commentTokens)
    {
        if (firstField == null || commentTokens == null)
        {
            return false;
        }

        // Ordinal comparison: comment markers are symbols, not linguistic text.
        return commentTokens.Any(token =>
            !string.IsNullOrEmpty(token) && firstField.StartsWith(token, StringComparison.Ordinal));
    }

    /// <summary>
    /// Placeholder: TextFieldParser exposes no buffer-size setting, so this method
    /// intentionally does nothing and <paramref name="bufferSize"/> is ignored.
    /// </summary>
    private static void SetParserBufferSize(TextFieldParser parser, int bufferSize)
    {
        // Intentionally empty; see summary.
    }
}

2. 高性能方案:自定义流式读取器

对于超大文件或性能要求极高的场景,自定义流式读取器是最佳选择。

高性能CSV读取器

/// <summary>
/// Streaming CSV reader that yields one record at a time so the whole file
/// never has to be held in memory.
/// </summary>
public class HighPerformanceCsvReader : IDisposable
{
    private readonly StreamReader _reader;
    private readonly char _delimiter;
    private readonly char _quoteChar;
    private readonly bool _trimWhitespace;
    private long _bytesRead;
    private int _lineNumber;

    /// <summary>Opens <paramref name="filePath"/> for reading (UTF-8 unless a BOM says otherwise).</summary>
    /// <param name="filePath">Path of the CSV file.</param>
    /// <param name="delimiter">Field separator.</param>
    /// <param name="quoteChar">Character that encloses quoted fields; doubled to escape itself.</param>
    /// <param name="trimWhitespace">When true, unquoted fields are trimmed.</param>
    public HighPerformanceCsvReader(string filePath, char delimiter = ',', char quoteChar = '"', bool trimWhitespace = true)
    {
        _reader = new StreamReader(filePath, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, bufferSize: 65536);
        _delimiter = delimiter;
        _quoteChar = quoteChar;
        _trimWhitespace = trimWhitespace;
        _lineNumber = 0;
    }

    /// <summary>
    /// Lazily enumerates the records of the file. Blank lines between records
    /// are skipped. A quoted field may span several physical lines; the lines
    /// are rejoined with '\n' before the record is parsed.
    /// </summary>
    public IEnumerable<string[]> ReadRecords()
    {
        string line;
        while ((line = ReadPhysicalLine()) != null)
        {
            // Skip blank lines between records.
            if (string.IsNullOrWhiteSpace(line))
                continue;

            // An odd number of quote characters means a quoted field is still
            // open at the end of the line, so the record continues on the next line.
            if (HasOddQuoteCount(line))
            {
                var record = new StringBuilder(line);
                bool open = true;
                while (open)
                {
                    string next = ReadPhysicalLine();
                    if (next == null)
                        break; // End of file inside a quoted field: parse what we have.

                    record.Append('\n').Append(next);
                    if (HasOddQuoteCount(next))
                        open = false;
                }
                line = record.ToString();
            }

            yield return ParseLine(line);
        }
    }

    /// <summary>Enumerates the records, converting each one with <paramref name="mapper"/>.</summary>
    public IEnumerable<T> ReadRecords<T>(Func<string[], T> mapper) where T : class
    {
        foreach (var fields in ReadRecords())
        {
            yield return mapper(fields);
        }
    }

    /// <summary>Reads one physical line and updates the line and byte counters; null at end of file.</summary>
    private string ReadPhysicalLine()
    {
        string line = _reader.ReadLine();
        if (line != null)
        {
            _lineNumber++;

            // Count encoded bytes, not UTF-16 chars. The original line terminator is
            // not visible after ReadLine, so Environment.NewLine is used as an estimate.
            Encoding encoding = _reader.CurrentEncoding;
            _bytesRead += encoding.GetByteCount(line) + encoding.GetByteCount(Environment.NewLine);
        }
        return line;
    }

    /// <summary>
    /// Returns true when <paramref name="text"/> contains an odd number of quote
    /// characters. Every quote either toggles the quoted state or is half of an
    /// escaped pair, so the parity tells whether the quoted state flipped.
    /// </summary>
    private bool HasOddQuoteCount(string text)
    {
        bool odd = false;
        foreach (char c in text)
        {
            if (c == _quoteChar)
                odd = !odd;
        }
        return odd;
    }

    /// <summary>Splits one logical record into fields, honouring quotes and doubled-quote escapes.</summary>
    private string[] ParseLine(string line)
    {
        var fields = new List<string>();
        var currentField = new StringBuilder();
        bool inQuotes = false;
        bool wasInQuotes = false;

        for (int i = 0; i < line.Length; i++)
        {
            char currentChar = line[i];
            char? nextChar = i < line.Length - 1 ? line[i + 1] : null;

            if (currentChar == _quoteChar)
            {
                if (inQuotes)
                {
                    // A doubled quote inside a quoted field is an escaped quote.
                    if (nextChar == _quoteChar)
                    {
                        currentField.Append(_quoteChar);
                        i++; // Skip the second quote of the pair.
                    }
                    else
                    {
                        inQuotes = false;
                        wasInQuotes = true;
                    }
                }
                else
                {
                    inQuotes = true;
                }
            }
            else if (currentChar == _delimiter && !inQuotes)
            {
                string fieldValue = currentField.ToString();

                // Quoted fields keep their whitespace verbatim.
                if (_trimWhitespace && !wasInQuotes)
                    fieldValue = fieldValue.Trim();

                fields.Add(fieldValue);
                currentField.Clear();
                wasInQuotes = false;
            }
            else
            {
                currentField.Append(currentChar);
            }
        }

        // Flush the final field (there is no trailing delimiter).
        string lastField = currentField.ToString();
        if (_trimWhitespace && !wasInQuotes)
            lastField = lastField.Trim();
        fields.Add(lastField);

        return fields.ToArray();
    }

    /// <summary>Closes the underlying file.</summary>
    public void Dispose()
    {
        _reader?.Dispose();
    }

    /// <summary>Approximate number of bytes consumed so far (line terminators are estimated).</summary>
    public long BytesRead => _bytesRead;

    /// <summary>Number of physical lines read so far.</summary>
    public int LineNumber => _lineNumber;
}

使用示例

/// <summary>Simple record type used by the CSV examples.</summary>
public class Person
{
    /// <summary>Person's name.</summary>
    public string Name { get; set; }

    /// <summary>Age in years.</summary>
    public int Age { get; set; }

    /// <summary>E-mail address.</summary>
    public string Email { get; set; }

    /// <summary>
    /// Registration date. Nullable so that records without a date remain valid;
    /// the CsvHelper class map in this file maps a column to this member.
    /// </summary>
    public System.DateTime? RegisterDate { get; set; }
}

/// <summary>Shows how to stream a large CSV file through HighPerformanceCsvReader.</summary>
public class HighPerformanceExample
{
    /// <summary>
    /// Streams "large_file.csv", maps each record to a <see cref="Person"/>,
    /// and prints progress every 10000 records plus a final summary.
    /// </summary>
    public static void ProcessLargeCsv()
    {
        // The unused local of the undefined type ProcessingStats was removed;
        // it prevented the example from compiling.
        using var reader = new HighPerformanceCsvReader("large_file.csv");

        var stopwatch = Stopwatch.StartNew();

        // Lazy sequence: nothing is read until the foreach below pulls records.
        var persons = reader.ReadRecords(fields => new Person
        {
            Name = fields.Length > 0 ? fields[0] : null,
            Age = fields.Length > 1 && int.TryParse(fields[1], out int age) ? age : 0,
            Email = fields.Length > 2 ? fields[2] : null
        });

        int count = 0;
        foreach (var person in persons)
        {
            ProcessPerson(person);
            count++;

            // Report progress every 10000 records.
            if (count % 10000 == 0)
            {
                Console.WriteLine($"已处理 {count} 条记录,读取字节: {reader.BytesRead / 1024 / 1024} MB");
            }
        }

        stopwatch.Stop();
        Console.WriteLine($"处理完成: {count} 条记录,耗时: {stopwatch.Elapsed.TotalSeconds:F2} 秒");
    }

    /// <summary>Placeholder for the per-record business logic (database save, calculations, ...).</summary>
    private static void ProcessPerson(Person person)
    {
        // Intentionally empty in this example.
    }
}

3. 使用第三方库:CsvHelper

CsvHelper 是最流行的 CSV 处理库,功能强大且性能优异。

安装和基本使用

// Install-Package CsvHelper

using CsvHelper;
using CsvHelper.Configuration;
using System.Globalization;
using System.IO;

/// <summary>Examples of reading and writing CSV files with the CsvHelper library.</summary>
public class CsvHelperExample
{
    /// <summary>Maps CSV column names to <see cref="Person"/> members.</summary>
    public class PersonMap : ClassMap<Person>
    {
        public PersonMap()
        {
            Map(m => m.Name).Name("Full Name");
            Map(m => m.Age).Name("Age").TypeConverterOption.CultureInfo(CultureInfo.InvariantCulture);
            Map(m => m.Email).Name("Email Address");
            // NOTE(review): this mapping requires Person to expose a RegisterDate member.
            Map(m => m.RegisterDate).Name("Register Date").TypeConverterOption.Format("yyyy-MM-dd");
        }
    }

    /// <summary>Reads the whole file into memory using <see cref="PersonMap"/> and prints the record count.</summary>
    /// <param name="filePath">Path of the CSV file to read.</param>
    public static void ReadWithCsvHelper(string filePath)
    {
        var config = new CsvConfiguration(CultureInfo.InvariantCulture)
        {
            HasHeaderRecord = true,
            Delimiter = ",",
            TrimOptions = TrimOptions.Trim,
            MissingFieldFound = null, // Ignore missing fields.
            HeaderValidated = null,   // Ignore header validation errors.
            BadDataFound = context =>
            {
                Console.WriteLine($"发现损坏数据: {context.RawRecord}");
            }
        };

        using var reader = new StreamReader(filePath);
        using var csv = new CsvReader(reader, config);

        // Register the column mapping before reading.
        csv.Context.RegisterClassMap<PersonMap>();

        // Materialise every record.
        var records = csv.GetRecords<Person>().ToList();

        Console.WriteLine($"读取了 {records.Count} 条记录");
    }

    /// <summary>
    /// Reads the file record by record. Despite its name this method is
    /// synchronous; the name is kept for compatibility with existing callers.
    /// </summary>
    /// <param name="filePath">Path of the CSV file to read.</param>
    public static void ReadWithAsync(string filePath)
    {
        var config = new CsvConfiguration(CultureInfo.InvariantCulture)
        {
            HasHeaderRecord = true,
            // Invariant lower-casing: header matching must not depend on the current culture.
            PrepareHeaderForMatch = args => args.Header.ToLowerInvariant()
        };

        using var reader = new StreamReader(filePath);
        using var csv = new CsvReader(reader, config);

        // Row-by-row (synchronous) reading.
        var records = new List<Person>();
        while (csv.Read())
        {
            var record = csv.GetRecord<Person>();
            if (record != null)
            {
                records.Add(record);
            }
        }
    }

    /// <summary>Writes two sample <see cref="Person"/> records to <paramref name="outputPath"/>.</summary>
    public static void WriteCsvExample(string outputPath)
    {
        var records = new List<Person>
        {
            new Person { Name = "张三", Age = 25, Email = "zhangsan@example.com" },
            new Person { Name = "李四", Age = 30, Email = "lisi@example.com" }
        };

        using var writer = new StreamWriter(outputPath);
        using var csv = new CsvWriter(writer, CultureInfo.InvariantCulture);

        csv.WriteRecords(records);
    }
}

4. 性能对比和最佳实践

性能测试方法

/// <summary>Times each CSV reading strategy against the same input file.</summary>
public class PerformanceBenchmark
{
    /// <summary>
    /// Runs every registered strategy once on <paramref name="testFile"/> and
    /// prints the record count, elapsed seconds and throughput for each.
    /// </summary>
    public static void RunBenchmark(string testFile)
    {
        var strategies = new Dictionary<string, Func<string, List<Person>>>();
        strategies["TextFieldParser"] = path => TextFieldParserMethod(path);
        strategies["HighPerformance"] = path => HighPerformanceMethod(path);
        strategies["CsvHelper"] = path => CsvHelperMethod(path);

        foreach (KeyValuePair<string, Func<string, List<Person>>> entry in strategies)
        {
            Stopwatch timer = Stopwatch.StartNew();
            List<Person> loaded = entry.Value(testFile);
            timer.Stop();

            // Elapsed is fixed once the stopwatch has stopped, so read it once.
            double seconds = timer.Elapsed.TotalSeconds;

            string report = $"{entry.Key}: {loaded.Count} 条记录, " +
                            $"耗时: {seconds:F2}秒, " +
                            $"速度: {loaded.Count / seconds:F0} 条/秒";
            Console.WriteLine(report);
        }
    }

    /// <summary>Stub for the TextFieldParser strategy; returns an empty list.</summary>
    private static List<Person> TextFieldParserMethod(string path) => new List<Person>();

    /// <summary>Stub for the streaming reader strategy; returns an empty list.</summary>
    private static List<Person> HighPerformanceMethod(string path) => new List<Person>();

    /// <summary>Stub for the CsvHelper strategy; returns an empty list.</summary>
    private static List<Person> CsvHelperMethod(string path) => new List<Person>();
}

性能对比结果(示例)

方法 | 10MB文件 | 100MB文件 | 1GB文件 | 内存使用 | 特点
TextFieldParser | 中等 | 中等 | 内置支持,易用性好
高性能流式 | 最快 | 最快 | 最快 | 最低 | 最高性能,适合大文件
CsvHelper | 中等 | 中等 | 较高 | 功能丰富,映射方便

5. 最佳实践总结

1. 选择合适的方案

  • 小文件 (<10MB):使用 CsvHelper,开发效率高
  • 中等文件 (10MB-1GB):使用 TextFieldParser,平衡性能和易用性
  • 大文件 (>1GB):使用自定义流式读取器,内存效率最高

2. 性能优化技巧

// 1. Open the file with an explicitly sized read buffer (64 KB).
using var reader = new StreamReader(
    filePath,
    Encoding.UTF8,
    detectEncodingFromByteOrderMarks: true,
    bufferSize: 65536);

// 2. Asynchronous processing (for I/O-bound work).
/// <summary>
/// Reads <paramref name="filePath"/> line by line without blocking the calling thread.
/// </summary>
/// <param name="filePath">Path of the CSV file to read.</param>
public async Task ProcessCsvAsync(string filePath)
{
    // useAsync: true opens the file for asynchronous I/O so that
    // ReadLineAsync does not fall back to blocking reads.
    var stream = new FileStream(
        filePath, FileMode.Open, FileAccess.Read, FileShare.Read,
        bufferSize: 65536, useAsync: true);

    // Disposing the reader also disposes the underlying stream.
    using var reader = new StreamReader(stream);

    string line;
    while ((line = await reader.ReadLineAsync()) != null)
    {
        // Parse and process the line here.
    }
}

// 3. Process records in batches instead of one at a time.
var batchSize = 1000;
var batch = new List<Person>(batchSize);
foreach (var record in records)
{
    batch.Add(record);
    if (batch.Count >= batchSize)
    {
        await SaveBatchToDatabase(batch);
        batch.Clear();
    }
}

// Flush the final partial batch; without this the last (count % batchSize)
// records would never be saved.
if (batch.Count > 0)
{
    await SaveBatchToDatabase(batch);
    batch.Clear();
}

// 4. Parallel processing (ProcessRecord must be thread-safe).
var parallelOptions = new ParallelOptions
{
    // One worker per logical processor.
    MaxDegreeOfParallelism = Environment.ProcessorCount
};
Parallel.ForEach(records, parallelOptions, ProcessRecord);

3. 错误处理和验证

/// <summary>Validates <see cref="Person"/> records read from a CSV file.</summary>
public class RobustCsvProcessor
{
    /// <summary>Outcome of validating one record.</summary>
    public class ValidationResult
    {
        /// <summary>True when <see cref="Errors"/> is empty.</summary>
        public bool IsValid { get; set; }

        /// <summary>Human-readable validation messages.</summary>
        public List<string> Errors { get; set; } = new();
    }

    /// <summary>
    /// Checks that the name is present, the age is within 0-150 and the
    /// e-mail address is well formed.
    /// </summary>
    /// <param name="person">Record to validate; must not be null.</param>
    /// <returns>A result listing every rule that failed.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="person"/> is null.</exception>
    public ValidationResult ValidateRecord(Person person)
    {
        if (person == null)
        {
            throw new System.ArgumentNullException(nameof(person));
        }

        var result = new ValidationResult();

        if (string.IsNullOrEmpty(person.Name))
        {
            result.Errors.Add("姓名不能为空");
        }

        if (person.Age < 0 || person.Age > 150)
        {
            result.Errors.Add("年龄必须在0-150之间");
        }

        if (!IsValidEmail(person.Email))
        {
            result.Errors.Add("邮箱格式不正确");
        }

        result.IsValid = result.Errors.Count == 0;
        return result;
    }

    /// <summary>
    /// Returns true when <paramref name="email"/> parses as a mail address and
    /// consists of the bare address only (no display name or extra text).
    /// </summary>
    private bool IsValidEmail(string email)
    {
        // TryCreate reports failure through its return value, so no exception
        // is used for control flow and unrelated exceptions are not swallowed.
        return System.Net.Mail.MailAddress.TryCreate(email, out var addr)
            && addr.Address == email;
    }
}

4. 内存管理

// For very large files, use the producer-consumer pattern.
/// <summary>
/// Reads a CSV file on one task while several consumer tasks process the
/// records, connected by a bounded queue so memory use stays limited.
/// </summary>
public class ProducerConsumerCsvReader
{
    // Upper bound on queued records; the producer blocks when the queue is full.
    private const int QueueCapacity = 1000;
    private const int MaxDegreeOfParallelism = 4;

    /// <summary>
    /// Processes every record of <paramref name="filePath"/> and blocks until
    /// the producer and all consumers have finished.
    /// </summary>
    /// <param name="filePath">Path of the CSV file to read.</param>
    public void ProcessLargeFile(string filePath)
    {
        // A fresh queue per call: a completed BlockingCollection cannot accept
        // items again, so a shared field would make this method single-use.
        using var queue = new BlockingCollection<string[]>(boundedCapacity: QueueCapacity);

        // Producer task.
        var producer = Task.Run(() =>
        {
            try
            {
                using var reader = new HighPerformanceCsvReader(filePath);
                foreach (var record in reader.ReadRecords())
                {
                    queue.Add(record);
                }
            }
            finally
            {
                // Always signal completion, even if reading fails; otherwise the
                // consumers would wait on the queue forever.
                queue.CompleteAdding();
            }
        });

        // Consumer tasks.
        var consumers = Enumerable.Range(0, MaxDegreeOfParallelism)
            .Select(i => Task.Run(() => Consumer(queue, i)))
            .ToArray();

        // Producer failures surface here as an AggregateException.
        Task.WaitAll(new[] { producer }.Concat(consumers).ToArray());
    }

    /// <summary>Drains the queue until the producer completes, logging per-record failures.</summary>
    private void Consumer(BlockingCollection<string[]> queue, int consumerId)
    {
        foreach (var record in queue.GetConsumingEnumerable())
        {
            try
            {
                ProcessRecord(record);
            }
            catch (Exception ex)
            {
                Console.WriteLine($"消费者 {consumerId} 处理记录时出错: {ex.Message}");
            }
        }
    }

    /// <summary>
    /// Per-record processing hook. Called concurrently from several consumer
    /// tasks, so overrides must be thread-safe. The default does nothing.
    /// </summary>
    protected virtual void ProcessRecord(string[] record)
    {
    }
}

通过本文提供的各种方案和最佳实践,你可以根据具体需求选择最适合的CSV读取方法,在保证性能的同时处理各种复杂的CSV文件场景。

© 版权声明
THE END
喜欢就点个赞,支持一下吧!
点赞8 分享
评论 抢沙发
头像
欢迎您留下评论!
提交
头像

昵称

取消
昵称表情代码图片

    暂无评论内容