读取大文本文件非常慢
Reading large text file very slow
我接到了一个任务:编写一个 VB 程序来读取一个很大的 .txt 文件(大小从 500MB 到 2GB 不等)。文件中的每一行通常以 13 位数字开头,后面跟着大量其他信息。
(例如“1578597500548 info info info info 等”)我必须让用户输入一个 13 位数字,然后我的程序在这个大文件中逐行检查行首是否为该数字,如果找到,则将整行写入一个新的 .txt 文件!
我当前的程序运行正常,但我注意到把数据读入列表(list/StreamReader)的部分占用了大约 90% 的处理时间。平均每次运行大约需要 27 秒。有什么办法可以加快速度吗?
这是我写的。
' Click handler from the question: reads the whole input file into a list, selects the lines
' that start with the text typed into TextBox1, writes them to a new file, prints the elapsed
' time and then exits the application.
Private Sub Button2_Click(sender As Object, e As EventArgs) Handles Button2.Click
Dim wtr As IO.StreamWriter
Dim listy As New List(Of String)
' NOTE(review): i is never used in this handler.
Dim i = 0
' stpw is declared outside this handler (presumably a Stopwatch - confirm); it times the whole run.
stpw.Reset()
stpw.Start()
'Read in the file of large data (700 MB and larger).
'Every line is added to the list, so the whole file is held in memory even though only the
'matching lines are used afterwards.
Using Reader As New StreamReader("G:\USER\FOLDER\tester.txt")
While Reader.EndOfStream = False
listy.Add(Reader.ReadLine)
End While
End Using
'Select the lines that start with the number typed into the textbox.
'The query is deferred: it is evaluated when the For Each loop below enumerates it.
Dim result = From n In listy
Where n.StartsWith(TextBox1.Text)
Select n
'Write the results found into a new file.
'NOTE(review): wtr is not wrapped in a Using block, so it is not closed if WriteLine throws.
wtr = New StreamWriter("G:\USER\searched-number.txt")
For Each word As String In result
wtr.WriteLine(word)
Next
wtr.Close()
stpw.Stop()
Debug.WriteLine(stpw.Elapsed.TotalMilliseconds)
' The application is shut down after a single search.
Application.Exit()
End Sub
UPDATE 我采纳了一些建议,不再先把内容放入列表,而是在读取时直接搜索,这样大约快了 5 秒,但仍然需要 23 秒才能完成,而且它写出的是我正在搜索的数字上方的那一行。如果你能告诉我哪里出错了就太好了。谢谢大家!
' Streaming variant from the question's update: scans the file line by line and writes matches
' straight to the output file instead of collecting every line in a list first.
' wtr and lineIn are declared outside this fragment.
wtr = New StreamWriter("G:\Karl\searchednumber.txt")
Using Reader As New StreamReader("G:\Karl\AC\tester.txt")
While Reader.EndOfStream = False
' First ReadLine of this iteration: the line is stored in lineIn.
lineIn = Reader.ReadLine
' NOTE(review): this is a second ReadLine, so the test runs against the line AFTER lineIn,
' while the line written below is lineIn itself. That is why the output contains the line
' above the match, and why only every other line is ever tested. Testing
' lineIn.StartsWith(...) instead would check the line that gets written.
' NOTE(review): if the file has an odd number of lines, this second ReadLine returns Nothing
' on the last iteration and StartsWith throws a NullReferenceException.
If Reader.ReadLine.StartsWith(TextBox1.Text) Then
wtr.WriteLine(lineIn)
Else
' Redundant: the loop would continue here anyway.
Continue While
End If
End While
' NOTE(review): wtr is closed inside the Using block but is not itself protected by Using,
' so it stays open if an exception is thrown above.
wtr.Close()
End Using
程序加载时索引文件。
创建一个Dictionary(Of ULong, Long)
,并在程序加载时读取文件。对于每一行,向字典中添加一个条目,以每行开头的 13 位数值作为 ULong 键,以该行在文件流中的位置作为 Long 值。
然后,当用户输入密钥时,您可以立即检查字典,以在磁盘上找到您需要的确切位置并直接在那里查找。
在程序启动时建立文件索引可能需要一些时间,但您只需一次。现在,您要么需要在每次用户想要搜索时搜索整个内容,要么在内存中保留数百兆字节的文本文件数据。一旦你有了索引,在字典中查找一个值然后直接查找它应该几乎是立即发生的。
我刚看到这条评论:
there could be more than 1 occurrences of a 13 digit number so must search the whole file.
基于此,索引应该是 Dictionary(Of ULong, List(Of Long))
,其中向条目添加值首先创建一个列表实例(如果不存在),然后将新值添加到列表中。
下面是在没有测试数据、也没有借助 Visual Studio 的情况下直接在回复窗口中输入的初步尝试,因此可能仍然包含一些错误:
''' <summary>
''' Builds an in-memory index over a large text file whose lines start with a 13-digit numeric
''' key, mapping each key to the byte offsets of the lines that carry it, so that later searches
''' can seek straight to the matching lines instead of scanning the whole file.
''' </summary>
''' <remarks>
''' The file is scanned as raw bytes, so digits and line terminators must be single bytes
''' (ASCII, UTF-8 or an ANSI code page). Offsets cannot be taken from
''' StreamReader.BaseStream.Position, because the reader reads ahead in blocks and the stream
''' position therefore does not correspond to the line that was just returned.
''' </remarks>
Public Class MyFileIndexer
    ' Number of leading characters on each line that form the key.
    Private Const KeyLength As Integer = 13
    ' Size of the chunks read from disk while scanning.
    Private Const ScanBufferSize As Integer = 65536

    Private initialCapacity As Integer = 1
    Private Property FilePath As String
    Private Index As Dictionary(Of ULong, List(Of Long))

    ''' <summary>Creates the indexer and immediately indexes the file.</summary>
    ''' <param name="filePath">Path of the text file to index.</param>
    ''' <exception cref="ArgumentNullException"><paramref name="filePath"/> is Nothing.</exception>
    Public Sub New(filePath As String)
        If filePath Is Nothing Then Throw New ArgumentNullException(NameOf(filePath))
        Me.FilePath = filePath
        RebuildIndex()
    End Sub

    ''' <summary>
    ''' Re-reads the file and rebuilds the key-to-offsets index from scratch. Lines that do not
    ''' start with 13 decimal digits are skipped. CR, LF and CRLF line endings are recognised.
    ''' </summary>
    Public Sub RebuildIndex()
        Dim newIndex As New Dictionary(Of ULong, List(Of Long))()
        Dim buffer(ScanBufferSize - 1) As Byte
        Dim prefix(KeyLength - 1) As Byte   ' first KeyLength bytes of the current line
        Dim prefixLength As Integer = 0
        Dim lineStart As Long = 0           ' byte offset at which the current line starts
        Dim chunkOffset As Long = 0         ' byte offset of buffer(0) within the file

        Using fs As New System.IO.FileStream(FilePath, System.IO.FileMode.Open,
                                             System.IO.FileAccess.Read, System.IO.FileShare.Read,
                                             ScanBufferSize, System.IO.FileOptions.SequentialScan)
            Dim bytesRead As Integer = fs.Read(buffer, 0, buffer.Length)
            Dim first As Integer = 0

            ' Skip a UTF-8 byte order mark so that it is not mistaken for part of the first key.
            If bytesRead >= 3 AndAlso buffer(0) = &HEF AndAlso buffer(1) = &HBB AndAlso buffer(2) = &HBF Then
                first = 3
                lineStart = 3
            End If

            While bytesRead > 0
                For i As Integer = first To bytesRead - 1
                    Dim b As Byte = buffer(i)
                    If b = 10 OrElse b = 13 Then
                        ' End of line. The empty "line" between the CR and LF of a CRLF pair has
                        ' no prefix and is ignored by AddEntry.
                        AddEntry(newIndex, prefix, prefixLength, lineStart)
                        prefixLength = 0
                        lineStart = chunkOffset + i + 1
                    ElseIf prefixLength < KeyLength Then
                        prefix(prefixLength) = b
                        prefixLength += 1
                    End If
                Next
                chunkOffset += bytesRead
                first = 0
                bytesRead = fs.Read(buffer, 0, buffer.Length)
            End While
        End Using

        ' The last line may not be terminated by a line break.
        AddEntry(newIndex, prefix, prefixLength, lineStart)

        ' Publish only a completely built index.
        Index = newIndex
    End Sub

    ' Records position under the key formed by the collected prefix, if the prefix is a full
    ' run of KeyLength decimal digits; otherwise does nothing.
    Private Sub AddEntry(target As Dictionary(Of ULong, List(Of Long)), prefix As Byte(),
                         prefixLength As Integer, position As Long)
        If prefixLength < KeyLength Then Return

        Dim key As ULong = 0
        For i As Integer = 0 To KeyLength - 1
            Dim digit As Integer = CInt(prefix(i)) - 48   ' 48 = "0"c
            If digit < 0 OrElse digit > 9 Then Return
            key = key * 10UL + CULng(digit)
        Next

        Dim positions As List(Of Long) = Nothing
        If Not target.TryGetValue(key, positions) Then
            positions = New List(Of Long)(initialCapacity)
            target.Add(key, positions)
        End If
        positions.Add(position)
    End Sub

    ''' <summary>Searches for a key given as text.</summary>
    ''' <param name="key">Expected to be a 13-character numeric string.</param>
    ''' <returns>The matching lines in file order, or Nothing when the key is not indexed.</returns>
    ''' <remarks>Throws the ULong.Parse exceptions if parsing fails. Be prepared for that.</remarks>
    Public Function Search(key As String) As List(Of String)
        Dim realKey As ULong = ULong.Parse(key)
        Return Search(realKey)
    End Function

    ''' <summary>Returns every line of the file that starts with the given key.</summary>
    ''' <param name="key">Numeric value of the 13-digit line prefix.</param>
    ''' <returns>The matching lines in file order, or Nothing when the key is not indexed.</returns>
    Public Function Search(key As ULong) As List(Of String)
        Dim positions As List(Of Long) = Nothing
        If Not Index.TryGetValue(key, positions) Then Return Nothing

        Dim result As New List(Of String)(positions.Count)
        Using sr As New System.IO.StreamReader(FilePath)
            For Each position As Long In positions
                sr.BaseStream.Seek(position, System.IO.SeekOrigin.Begin)
                ' The reader still holds data buffered from the previous position; without
                ' discarding it, ReadLine would keep returning text from the old location.
                sr.DiscardBufferedData()
                result.Add(sr.ReadLine())
            Next position
        End Using
        Return result
    End Function
End Class
'Somewhere public, when your application starts up:
Public Index As New MyFileIndexer("G:\USER\FOLDER\tester.txt")
Private Sub Button2_Click(sender As Object, e As EventArgs) Handles Button2.Click
Dim lines As List(Of String) = Nothing
Try
lines = Index.Search(TextBox1.Text)
Catch
'Do something here
End Try
If lines IsNot Nothing Then
Using sw As New StreamWriter($"G:\USER\{TextBox1.Text}.txt")
For Each line As String in lines
sw.WriteLine(line)
Next
End Using
End If
End Sub
为了好玩,这是该类的泛型版本,它允许您提供自己的键选择器函数来索引任意每行存储一个键的文件,我认为它对较大的 csv 数据集通常会很有用。
''' <summary>
''' Builds an in-memory index over a line-oriented text file. A caller-supplied selector
''' extracts a key from every line, and the index maps each key to the byte offsets of the
''' lines that carry it, so that searches can seek straight to the matching lines.
''' </summary>
''' <typeparam name="TKey">Type of the key extracted from each line.</typeparam>
''' <remarks>
''' Lines are located by scanning raw bytes and are decoded as UTF-8 before being handed to the
''' selector. Offsets cannot be taken from StreamReader.BaseStream.Position, because the reader
''' reads ahead in blocks and the stream position does not correspond to the line just returned.
''' </remarks>
Public Class MyFileIndexer(Of TKey)
    ' Size of the chunks read from disk while scanning.
    Private Const ScanBufferSize As Integer = 65536

    Private initialCapacity As Integer = 1
    Private Property FilePath As String
    Private Index As Dictionary(Of TKey, List(Of Long))
    Private GetKey As Func(Of String, TKey)

    ''' <summary>Creates the indexer and immediately indexes the file.</summary>
    ''' <param name="filePath">Path of the text file to index.</param>
    ''' <param name="keySelector">
    ''' Extracts the key from one line of text. Returning Nothing leaves that line out of the index.
    ''' </param>
    ''' <exception cref="ArgumentNullException">An argument is Nothing.</exception>
    Public Sub New(filePath As String, keySelector As Func(Of String, TKey))
        If filePath Is Nothing Then Throw New ArgumentNullException(NameOf(filePath))
        If keySelector Is Nothing Then Throw New ArgumentNullException(NameOf(keySelector))
        Me.FilePath = filePath
        Me.GetKey = keySelector
        RebuildIndex()
    End Sub

    ''' <summary>
    ''' Re-reads the file and rebuilds the key-to-offsets index from scratch.
    ''' CR, LF and CRLF line endings are recognised.
    ''' </summary>
    Public Sub RebuildIndex()
        Dim newIndex As New Dictionary(Of TKey, List(Of Long))()
        Dim buffer(ScanBufferSize - 1) As Byte
        Dim lineStart As Long = 0            ' byte offset at which the current line starts
        Dim chunkOffset As Long = 0          ' byte offset of buffer(0) within the file
        Dim previousWasCr As Boolean = False ' True right after a CR, to pair it with a following LF

        Using lineBytes As New System.IO.MemoryStream(),
              fs As New System.IO.FileStream(FilePath, System.IO.FileMode.Open,
                                             System.IO.FileAccess.Read, System.IO.FileShare.Read,
                                             ScanBufferSize, System.IO.FileOptions.SequentialScan)
            Dim bytesRead As Integer = fs.Read(buffer, 0, buffer.Length)
            Dim first As Integer = 0

            ' Skip a UTF-8 byte order mark so that it does not become part of the first line.
            If bytesRead >= 3 AndAlso buffer(0) = &HEF AndAlso buffer(1) = &HBB AndAlso buffer(2) = &HBF Then
                first = 3
                lineStart = 3
            End If

            While bytesRead > 0
                Dim segmentStart As Integer = first
                For i As Integer = first To bytesRead - 1
                    Dim b As Byte = buffer(i)
                    If b <> 10 AndAlso b <> 13 Then
                        previousWasCr = False
                        Continue For
                    End If

                    If b = 10 AndAlso previousWasCr Then
                        ' LF of a CRLF pair: the line was already emitted at the CR.
                        previousWasCr = False
                    Else
                        lineBytes.Write(buffer, segmentStart, i - segmentStart)
                        AddLine(newIndex, lineBytes, lineStart)
                        previousWasCr = (b = 13)
                    End If
                    segmentStart = i + 1
                    lineStart = chunkOffset + i + 1
                Next

                ' Keep the unterminated tail of this chunk; the line continues in the next one.
                lineBytes.Write(buffer, segmentStart, bytesRead - segmentStart)
                chunkOffset += bytesRead
                first = 0
                bytesRead = fs.Read(buffer, 0, buffer.Length)
            End While

            ' The last line may not be terminated by a line break.
            If lineBytes.Length > 0 Then AddLine(newIndex, lineBytes, lineStart)
        End Using

        ' Publish only a completely built index.
        Index = newIndex
    End Sub

    ' Decodes the collected bytes of one line, resets the collector, and records position
    ' under the key the selector returns for that line.
    Private Sub AddLine(target As Dictionary(Of TKey, List(Of Long)),
                        lineBytes As System.IO.MemoryStream, position As Long)
        Dim line As String = System.Text.Encoding.UTF8.GetString(lineBytes.GetBuffer(), 0, CInt(lineBytes.Length))
        lineBytes.SetLength(0)
        lineBytes.Position = 0

        Dim key As TKey = GetKey(line)
        ' A Nothing key cannot be stored in a Dictionary; treat it as "no key on this line".
        If key Is Nothing Then Return

        Dim positions As List(Of Long) = Nothing
        If Not target.TryGetValue(key, positions) Then
            positions = New List(Of Long)(initialCapacity)
            target.Add(key, positions)
        End If
        positions.Add(position)
    End Sub

    ''' <summary>Returns every line of the file whose key equals the given key.</summary>
    ''' <param name="key">Key to look up.</param>
    ''' <returns>The matching lines in file order, or Nothing when the key is not indexed.</returns>
    Public Function Search(key As TKey) As List(Of String)
        Dim positions As List(Of Long) = Nothing
        If Not Index.TryGetValue(key, positions) Then Return Nothing

        Dim result As New List(Of String)(positions.Count)
        Using sr As New System.IO.StreamReader(FilePath)
            For Each position As Long In positions
                sr.BaseStream.Seek(position, System.IO.SeekOrigin.Begin)
                ' The reader still holds data buffered from the previous position; without
                ' discarding it, ReadLine would keep returning text from the old location.
                sr.DiscardBufferedData()
                result.Add(sr.ReadLine())
            Next position
        End Using
        Return result
    End Function
End Class
我接到了一个任务:编写一个 VB 程序来读取一个很大的 .txt 文件(大小从 500MB 到 2GB 不等)。文件中的每一行通常以 13 位数字开头,后面跟着大量其他信息。 (例如“1578597500548 info info info info 等”)我必须让用户输入一个 13 位数字,然后我的程序在这个大文件中逐行检查行首是否为该数字,如果找到,则将整行写入一个新的 .txt 文件!
我当前的程序运行正常,但我注意到把数据读入列表(list/StreamReader)的部分占用了大约 90% 的处理时间。平均每次运行大约需要 27 秒。有什么办法可以加快速度吗? 这是我写的。
' Click handler from the question: reads the whole input file into a list, selects the lines
' that start with the text typed into TextBox1, writes them to a new file, prints the elapsed
' time and then exits the application.
Private Sub Button2_Click(sender As Object, e As EventArgs) Handles Button2.Click
Dim wtr As IO.StreamWriter
Dim listy As New List(Of String)
' NOTE(review): i is never used in this handler.
Dim i = 0
' stpw is declared outside this handler (presumably a Stopwatch - confirm); it times the whole run.
stpw.Reset()
stpw.Start()
'Read in the file of large data (700 MB and larger).
'Every line is added to the list, so the whole file is held in memory even though only the
'matching lines are used afterwards.
Using Reader As New StreamReader("G:\USER\FOLDER\tester.txt")
While Reader.EndOfStream = False
listy.Add(Reader.ReadLine)
End While
End Using
'Select the lines that start with the number typed into the textbox.
'The query is deferred: it is evaluated when the For Each loop below enumerates it.
Dim result = From n In listy
Where n.StartsWith(TextBox1.Text)
Select n
'Write the results found into a new file.
'NOTE(review): wtr is not wrapped in a Using block, so it is not closed if WriteLine throws.
wtr = New StreamWriter("G:\USER\searched-number.txt")
For Each word As String In result
wtr.WriteLine(word)
Next
wtr.Close()
stpw.Stop()
Debug.WriteLine(stpw.Elapsed.TotalMilliseconds)
' The application is shut down after a single search.
Application.Exit()
End Sub
UPDATE 我采纳了一些建议,不再先把内容放入列表,而是在读取时直接搜索,这样大约快了 5 秒,但仍然需要 23 秒才能完成,而且它写出的是我正在搜索的数字上方的那一行。如果你能告诉我哪里出错了就太好了。谢谢大家!
' Streaming variant from the question's update: scans the file line by line and writes matches
' straight to the output file instead of collecting every line in a list first.
' wtr and lineIn are declared outside this fragment.
wtr = New StreamWriter("G:\Karl\searchednumber.txt")
Using Reader As New StreamReader("G:\Karl\AC\tester.txt")
While Reader.EndOfStream = False
' First ReadLine of this iteration: the line is stored in lineIn.
lineIn = Reader.ReadLine
' NOTE(review): this is a second ReadLine, so the test runs against the line AFTER lineIn,
' while the line written below is lineIn itself. That is why the output contains the line
' above the match, and why only every other line is ever tested. Testing
' lineIn.StartsWith(...) instead would check the line that gets written.
' NOTE(review): if the file has an odd number of lines, this second ReadLine returns Nothing
' on the last iteration and StartsWith throws a NullReferenceException.
If Reader.ReadLine.StartsWith(TextBox1.Text) Then
wtr.WriteLine(lineIn)
Else
' Redundant: the loop would continue here anyway.
Continue While
End If
End While
' NOTE(review): wtr is closed inside the Using block but is not itself protected by Using,
' so it stays open if an exception is thrown above.
wtr.Close()
End Using
程序加载时索引文件。
创建一个Dictionary(Of ULong, Long)
,并在程序加载时读取文件。对于每一行,向字典中添加一个条目,以每行开头的 13 位数值作为 ULong 键,以该行在文件流中的位置作为 Long 值。
然后,当用户输入密钥时,您可以立即检查字典,以在磁盘上找到您需要的确切位置并直接在那里查找。
在程序启动时建立文件索引可能需要一些时间,但您只需一次。现在,您要么需要在每次用户想要搜索时搜索整个内容,要么在内存中保留数百兆字节的文本文件数据。一旦你有了索引,在字典中查找一个值然后直接查找它应该几乎是立即发生的。
我刚看到这条评论:
there could be more than 1 occurrences of a 13 digit number so must search the whole file.
基于此,索引应该是 Dictionary(Of ULong, List(Of Long))
,其中向条目添加值首先创建一个列表实例(如果不存在),然后将新值添加到列表中。
下面是在没有测试数据、也没有借助 Visual Studio 的情况下直接在回复窗口中输入的初步尝试,因此可能仍然包含一些错误:
''' <summary>
''' Builds an in-memory index over a large text file whose lines start with a 13-digit numeric
''' key, mapping each key to the byte offsets of the lines that carry it, so that later searches
''' can seek straight to the matching lines instead of scanning the whole file.
''' </summary>
''' <remarks>
''' The file is scanned as raw bytes, so digits and line terminators must be single bytes
''' (ASCII, UTF-8 or an ANSI code page). Offsets cannot be taken from
''' StreamReader.BaseStream.Position, because the reader reads ahead in blocks and the stream
''' position therefore does not correspond to the line that was just returned.
''' </remarks>
Public Class MyFileIndexer
    ' Number of leading characters on each line that form the key.
    Private Const KeyLength As Integer = 13
    ' Size of the chunks read from disk while scanning.
    Private Const ScanBufferSize As Integer = 65536

    Private initialCapacity As Integer = 1
    Private Property FilePath As String
    Private Index As Dictionary(Of ULong, List(Of Long))

    ''' <summary>Creates the indexer and immediately indexes the file.</summary>
    ''' <param name="filePath">Path of the text file to index.</param>
    ''' <exception cref="ArgumentNullException"><paramref name="filePath"/> is Nothing.</exception>
    Public Sub New(filePath As String)
        If filePath Is Nothing Then Throw New ArgumentNullException(NameOf(filePath))
        Me.FilePath = filePath
        RebuildIndex()
    End Sub

    ''' <summary>
    ''' Re-reads the file and rebuilds the key-to-offsets index from scratch. Lines that do not
    ''' start with 13 decimal digits are skipped. CR, LF and CRLF line endings are recognised.
    ''' </summary>
    Public Sub RebuildIndex()
        Dim newIndex As New Dictionary(Of ULong, List(Of Long))()
        Dim buffer(ScanBufferSize - 1) As Byte
        Dim prefix(KeyLength - 1) As Byte   ' first KeyLength bytes of the current line
        Dim prefixLength As Integer = 0
        Dim lineStart As Long = 0           ' byte offset at which the current line starts
        Dim chunkOffset As Long = 0         ' byte offset of buffer(0) within the file

        Using fs As New System.IO.FileStream(FilePath, System.IO.FileMode.Open,
                                             System.IO.FileAccess.Read, System.IO.FileShare.Read,
                                             ScanBufferSize, System.IO.FileOptions.SequentialScan)
            Dim bytesRead As Integer = fs.Read(buffer, 0, buffer.Length)
            Dim first As Integer = 0

            ' Skip a UTF-8 byte order mark so that it is not mistaken for part of the first key.
            If bytesRead >= 3 AndAlso buffer(0) = &HEF AndAlso buffer(1) = &HBB AndAlso buffer(2) = &HBF Then
                first = 3
                lineStart = 3
            End If

            While bytesRead > 0
                For i As Integer = first To bytesRead - 1
                    Dim b As Byte = buffer(i)
                    If b = 10 OrElse b = 13 Then
                        ' End of line. The empty "line" between the CR and LF of a CRLF pair has
                        ' no prefix and is ignored by AddEntry.
                        AddEntry(newIndex, prefix, prefixLength, lineStart)
                        prefixLength = 0
                        lineStart = chunkOffset + i + 1
                    ElseIf prefixLength < KeyLength Then
                        prefix(prefixLength) = b
                        prefixLength += 1
                    End If
                Next
                chunkOffset += bytesRead
                first = 0
                bytesRead = fs.Read(buffer, 0, buffer.Length)
            End While
        End Using

        ' The last line may not be terminated by a line break.
        AddEntry(newIndex, prefix, prefixLength, lineStart)

        ' Publish only a completely built index.
        Index = newIndex
    End Sub

    ' Records position under the key formed by the collected prefix, if the prefix is a full
    ' run of KeyLength decimal digits; otherwise does nothing.
    Private Sub AddEntry(target As Dictionary(Of ULong, List(Of Long)), prefix As Byte(),
                         prefixLength As Integer, position As Long)
        If prefixLength < KeyLength Then Return

        Dim key As ULong = 0
        For i As Integer = 0 To KeyLength - 1
            Dim digit As Integer = CInt(prefix(i)) - 48   ' 48 = "0"c
            If digit < 0 OrElse digit > 9 Then Return
            key = key * 10UL + CULng(digit)
        Next

        Dim positions As List(Of Long) = Nothing
        If Not target.TryGetValue(key, positions) Then
            positions = New List(Of Long)(initialCapacity)
            target.Add(key, positions)
        End If
        positions.Add(position)
    End Sub

    ''' <summary>Searches for a key given as text.</summary>
    ''' <param name="key">Expected to be a 13-character numeric string.</param>
    ''' <returns>The matching lines in file order, or Nothing when the key is not indexed.</returns>
    ''' <remarks>Throws the ULong.Parse exceptions if parsing fails. Be prepared for that.</remarks>
    Public Function Search(key As String) As List(Of String)
        Dim realKey As ULong = ULong.Parse(key)
        Return Search(realKey)
    End Function

    ''' <summary>Returns every line of the file that starts with the given key.</summary>
    ''' <param name="key">Numeric value of the 13-digit line prefix.</param>
    ''' <returns>The matching lines in file order, or Nothing when the key is not indexed.</returns>
    Public Function Search(key As ULong) As List(Of String)
        Dim positions As List(Of Long) = Nothing
        If Not Index.TryGetValue(key, positions) Then Return Nothing

        Dim result As New List(Of String)(positions.Count)
        Using sr As New System.IO.StreamReader(FilePath)
            For Each position As Long In positions
                sr.BaseStream.Seek(position, System.IO.SeekOrigin.Begin)
                ' The reader still holds data buffered from the previous position; without
                ' discarding it, ReadLine would keep returning text from the old location.
                sr.DiscardBufferedData()
                result.Add(sr.ReadLine())
            Next position
        End Using
        Return result
    End Function
End Class
'Somewhere public, when your application starts up:
Public Index As New MyFileIndexer("G:\USER\FOLDER\tester.txt")
Private Sub Button2_Click(sender As Object, e As EventArgs) Handles Button2.Click
Dim lines As List(Of String) = Nothing
Try
lines = Index.Search(TextBox1.Text)
Catch
'Do something here
End Try
If lines IsNot Nothing Then
Using sw As New StreamWriter($"G:\USER\{TextBox1.Text}.txt")
For Each line As String in lines
sw.WriteLine(line)
Next
End Using
End If
End Sub
为了好玩,这是该类的泛型版本,它允许您提供自己的键选择器函数来索引任意每行存储一个键的文件,我认为它对较大的 csv 数据集通常会很有用。
''' <summary>
''' Builds an in-memory index over a line-oriented text file. A caller-supplied selector
''' extracts a key from every line, and the index maps each key to the byte offsets of the
''' lines that carry it, so that searches can seek straight to the matching lines.
''' </summary>
''' <typeparam name="TKey">Type of the key extracted from each line.</typeparam>
''' <remarks>
''' Lines are located by scanning raw bytes and are decoded as UTF-8 before being handed to the
''' selector. Offsets cannot be taken from StreamReader.BaseStream.Position, because the reader
''' reads ahead in blocks and the stream position does not correspond to the line just returned.
''' </remarks>
Public Class MyFileIndexer(Of TKey)
    ' Size of the chunks read from disk while scanning.
    Private Const ScanBufferSize As Integer = 65536

    Private initialCapacity As Integer = 1
    Private Property FilePath As String
    Private Index As Dictionary(Of TKey, List(Of Long))
    Private GetKey As Func(Of String, TKey)

    ''' <summary>Creates the indexer and immediately indexes the file.</summary>
    ''' <param name="filePath">Path of the text file to index.</param>
    ''' <param name="keySelector">
    ''' Extracts the key from one line of text. Returning Nothing leaves that line out of the index.
    ''' </param>
    ''' <exception cref="ArgumentNullException">An argument is Nothing.</exception>
    Public Sub New(filePath As String, keySelector As Func(Of String, TKey))
        If filePath Is Nothing Then Throw New ArgumentNullException(NameOf(filePath))
        If keySelector Is Nothing Then Throw New ArgumentNullException(NameOf(keySelector))
        Me.FilePath = filePath
        Me.GetKey = keySelector
        RebuildIndex()
    End Sub

    ''' <summary>
    ''' Re-reads the file and rebuilds the key-to-offsets index from scratch.
    ''' CR, LF and CRLF line endings are recognised.
    ''' </summary>
    Public Sub RebuildIndex()
        Dim newIndex As New Dictionary(Of TKey, List(Of Long))()
        Dim buffer(ScanBufferSize - 1) As Byte
        Dim lineStart As Long = 0            ' byte offset at which the current line starts
        Dim chunkOffset As Long = 0          ' byte offset of buffer(0) within the file
        Dim previousWasCr As Boolean = False ' True right after a CR, to pair it with a following LF

        Using lineBytes As New System.IO.MemoryStream(),
              fs As New System.IO.FileStream(FilePath, System.IO.FileMode.Open,
                                             System.IO.FileAccess.Read, System.IO.FileShare.Read,
                                             ScanBufferSize, System.IO.FileOptions.SequentialScan)
            Dim bytesRead As Integer = fs.Read(buffer, 0, buffer.Length)
            Dim first As Integer = 0

            ' Skip a UTF-8 byte order mark so that it does not become part of the first line.
            If bytesRead >= 3 AndAlso buffer(0) = &HEF AndAlso buffer(1) = &HBB AndAlso buffer(2) = &HBF Then
                first = 3
                lineStart = 3
            End If

            While bytesRead > 0
                Dim segmentStart As Integer = first
                For i As Integer = first To bytesRead - 1
                    Dim b As Byte = buffer(i)
                    If b <> 10 AndAlso b <> 13 Then
                        previousWasCr = False
                        Continue For
                    End If

                    If b = 10 AndAlso previousWasCr Then
                        ' LF of a CRLF pair: the line was already emitted at the CR.
                        previousWasCr = False
                    Else
                        lineBytes.Write(buffer, segmentStart, i - segmentStart)
                        AddLine(newIndex, lineBytes, lineStart)
                        previousWasCr = (b = 13)
                    End If
                    segmentStart = i + 1
                    lineStart = chunkOffset + i + 1
                Next

                ' Keep the unterminated tail of this chunk; the line continues in the next one.
                lineBytes.Write(buffer, segmentStart, bytesRead - segmentStart)
                chunkOffset += bytesRead
                first = 0
                bytesRead = fs.Read(buffer, 0, buffer.Length)
            End While

            ' The last line may not be terminated by a line break.
            If lineBytes.Length > 0 Then AddLine(newIndex, lineBytes, lineStart)
        End Using

        ' Publish only a completely built index.
        Index = newIndex
    End Sub

    ' Decodes the collected bytes of one line, resets the collector, and records position
    ' under the key the selector returns for that line.
    Private Sub AddLine(target As Dictionary(Of TKey, List(Of Long)),
                        lineBytes As System.IO.MemoryStream, position As Long)
        Dim line As String = System.Text.Encoding.UTF8.GetString(lineBytes.GetBuffer(), 0, CInt(lineBytes.Length))
        lineBytes.SetLength(0)
        lineBytes.Position = 0

        Dim key As TKey = GetKey(line)
        ' A Nothing key cannot be stored in a Dictionary; treat it as "no key on this line".
        If key Is Nothing Then Return

        Dim positions As List(Of Long) = Nothing
        If Not target.TryGetValue(key, positions) Then
            positions = New List(Of Long)(initialCapacity)
            target.Add(key, positions)
        End If
        positions.Add(position)
    End Sub

    ''' <summary>Returns every line of the file whose key equals the given key.</summary>
    ''' <param name="key">Key to look up.</param>
    ''' <returns>The matching lines in file order, or Nothing when the key is not indexed.</returns>
    Public Function Search(key As TKey) As List(Of String)
        Dim positions As List(Of Long) = Nothing
        If Not Index.TryGetValue(key, positions) Then Return Nothing

        Dim result As New List(Of String)(positions.Count)
        Using sr As New System.IO.StreamReader(FilePath)
            For Each position As Long In positions
                sr.BaseStream.Seek(position, System.IO.SeekOrigin.Begin)
                ' The reader still holds data buffered from the previous position; without
                ' discarding it, ReadLine would keep returning text from the old location.
                sr.DiscardBufferedData()
                result.Add(sr.ReadLine())
            Next position
        End Using
        Return result
    End Function
End Class