搜索 + 反向搜索腌制文件,跳过值
Searching + reverse seeking a pickled file , values getting skipped
最小可重现示例,我的代码中仅使用 goto_index()
。其余的不言自明:
import pickle,os
def goto_index(idx_str,src,dest=False) :
'''Go to index :
1. Convert 1-based comma seperated digits in idx_str into 0-based list containing each index digit as int.
2. Starting from current position of src, iterate until index[0] matches current objec's position.
If matched, try to index the object as given. If not matched, function raises EOFError. If index illegal
in object function raises IndexError.If object found and index found in object, return value found and
seek src to begining of object @ index.
3. If dest is specified, all values until index will be copied to it from it's current position.
If element is not found in src, the result will be that all elements from src's current positon
to EOF are copied to dest.
'''
index = [int(subidx)-1 for subidx in idx_str.split(',')]
val = None
obj_cnt = -1 # 0-based count
try :
while True : # EOFError if index[0] >= EOF point
obj = pickle.load(src)
obj_cnt += 1
if obj_cnt == index[0] :
val = obj
for subidx in index[1::] :
val = val[subidx] # IndexError if index illegal
src.seek(-len(pickle.dumps(obj)),os.SEEK_CUR) # Seek to start of object at index
return val
elif dest : pickle.dump(obj,dest)
except (EOFError,IndexError) : raise # Caller will handle exceptions
def add_elements(f) :
pickle.dump('hello world',f)
pickle.dump('good morning',f)
pickle.dump('69 420',f)
pickle.dump('ending !',f)
def get_elements(f) :
elements = []
# Actual code similarly calls goto_index() in ascending order of indices, avoiding repeated seeks.
for idx_str in ('1','2','3') :
elements.append(goto_index(idx_str,f))
return elements
with open("tmp","wb+") as tmp :
add_elements(tmp)
print(', '.join(get_elements(tmp)))
'''Expected output : hello world, good morning, 69 420
Actual output : hello world, good morning, ending !
Issue : When asking for 3rd element, 3rd element skipped, 4th returned, why ?
'''
编辑:问题在于 goto_index()
在每次调用时将 obj_cnt
设置为 -1
。如何缓解这种情况?
问题是以下各项的组合:
obj_cnt
在函数调用期间不是持久的,所以总是从头开始,即使在每次调用中修改了文件位置,所以 goto_idx()
就像它在 BOF 一样,但实际上会很多提前。
- 在索引 (
src.seek(-len(pickle.dumps(obj)),os.SEEK_CUR)
) 处寻求对象的开始导致下一次读取读取与之前相同的对象 - 如果修复了之前的错误,这将导致 goto_index()
总是 从它的第一次调用开始并返回索引处的对象。
我修复了它 a) 将函数放在 class 中它可以访问计数变量,b) 添加一个额外的标志 fp_set
并且仅在它设置为 a 时才返回真值,c) 在 class 中提供一个 reset()
方法,以便在完成一系列有序查询后将 obj_cnt
重置为 -1
。
请记住,我对 python 中的 OOP 还很陌生,下面的代码中可能有些地方很奇怪:
class goto_index:
obj_cnt = -1 # 0-based count
def sorted(idx_str,src,dest=None,fp_set=False) :
#Use if going to indexes in ascending order in loop
# idx_str = comma-seperated index , eg : "7,8" like foo[7][8]
# src = file object to search in, from it's current position
# dest = if True, will copy all objects until obj @ idx_str found OR EOF
# fp_set = if True, will seek such that next read will return obj @ idx_str
index = [int(subidx)-1 for subidx in idx_str.split(',')]
# Make 0-based int list from 1-based csv string
val = None
try :
while True : # EOFError if not found
obj = pickle.load(src)
goto_index.obj_cnt += 1 # increment counter
if goto_index.obj_cnt == index[0] : # 1st element of index is object number
val = obj
for subidx in index[1::] : # Index the object itself
val = val[subidx] # IndexError if illegal index
if fp_set : src.seek(-len(pickle.dumps(obj)),os.SEEK_CUR)
# Seek back to begining of object in src
return val # Return value @ index
elif dest : pickle.dump(obj,dest) # Copy object to dest
except (EOFError, IndexError) : raise # Caller handles these
def reset():
goto_index.obj_cnt = -1
def random(idx_str,src,dest=None,fp_set=False) :
goto_index.reset() # Just in case
src.seek(0) # Just in case
goto_index.sorted(idx_str,src,dest=None,fp_set=False)
goto_index.reset() # Clear count
除了fetch_elements()
之外,问题的其他功能基本相同:
def fetch_elements(f) :
elements = []
for idx_str in ('1','2','3') : # Indexes are passed sorted
elements.append(goto_index.sorted(idx_str,f))
goto_index.reset() # Required if using the methods later
return elements
最小可重现示例,我的代码中仅使用 goto_index()
。其余的不言自明:
import pickle,os
def goto_index(idx_str,src,dest=False) :
'''Go to index :
1. Convert 1-based comma seperated digits in idx_str into 0-based list containing each index digit as int.
2. Starting from current position of src, iterate until index[0] matches current objec's position.
If matched, try to index the object as given. If not matched, function raises EOFError. If index illegal
in object function raises IndexError.If object found and index found in object, return value found and
seek src to begining of object @ index.
3. If dest is specified, all values until index will be copied to it from it's current position.
If element is not found in src, the result will be that all elements from src's current positon
to EOF are copied to dest.
'''
index = [int(subidx)-1 for subidx in idx_str.split(',')]
val = None
obj_cnt = -1 # 0-based count
try :
while True : # EOFError if index[0] >= EOF point
obj = pickle.load(src)
obj_cnt += 1
if obj_cnt == index[0] :
val = obj
for subidx in index[1::] :
val = val[subidx] # IndexError if index illegal
src.seek(-len(pickle.dumps(obj)),os.SEEK_CUR) # Seek to start of object at index
return val
elif dest : pickle.dump(obj,dest)
except (EOFError,IndexError) : raise # Caller will handle exceptions
def add_elements(f) :
pickle.dump('hello world',f)
pickle.dump('good morning',f)
pickle.dump('69 420',f)
pickle.dump('ending !',f)
def get_elements(f) :
elements = []
# Actual code similarly calls goto_index() in ascending order of indices, avoiding repeated seeks.
for idx_str in ('1','2','3') :
elements.append(goto_index(idx_str,f))
return elements
with open("tmp","wb+") as tmp :
add_elements(tmp)
print(', '.join(get_elements(tmp)))
'''Expected output : hello world, good morning, 69 420
Actual output : hello world, good morning, ending !
Issue : When asking for 3rd element, 3rd element skipped, 4th returned, why ?
'''
编辑:问题在于 goto_index()
在每次调用时将 obj_cnt
设置为 -1
。如何缓解这种情况?
问题是以下各项的组合:
obj_cnt
在函数调用期间不是持久的,所以总是从头开始,即使在每次调用中修改了文件位置,所以goto_idx()
就像它在 BOF 一样,但实际上会很多提前。- 在索引 (
src.seek(-len(pickle.dumps(obj)),os.SEEK_CUR)
) 处寻求对象的开始导致下一次读取读取与之前相同的对象 - 如果修复了之前的错误,这将导致goto_index()
总是 从它的第一次调用开始并返回索引处的对象。
我修复了它 a) 将函数放在 class 中它可以访问计数变量,b) 添加一个额外的标志 fp_set
并且仅在它设置为 a 时才返回真值,c) 在 class 中提供一个 reset()
方法,以便在完成一系列有序查询后将 obj_cnt
重置为 -1
。
请记住,我对 python 中的 OOP 还很陌生,下面的代码中可能有些地方很奇怪:
class goto_index:
obj_cnt = -1 # 0-based count
def sorted(idx_str,src,dest=None,fp_set=False) :
#Use if going to indexes in ascending order in loop
# idx_str = comma-seperated index , eg : "7,8" like foo[7][8]
# src = file object to search in, from it's current position
# dest = if True, will copy all objects until obj @ idx_str found OR EOF
# fp_set = if True, will seek such that next read will return obj @ idx_str
index = [int(subidx)-1 for subidx in idx_str.split(',')]
# Make 0-based int list from 1-based csv string
val = None
try :
while True : # EOFError if not found
obj = pickle.load(src)
goto_index.obj_cnt += 1 # increment counter
if goto_index.obj_cnt == index[0] : # 1st element of index is object number
val = obj
for subidx in index[1::] : # Index the object itself
val = val[subidx] # IndexError if illegal index
if fp_set : src.seek(-len(pickle.dumps(obj)),os.SEEK_CUR)
# Seek back to begining of object in src
return val # Return value @ index
elif dest : pickle.dump(obj,dest) # Copy object to dest
except (EOFError, IndexError) : raise # Caller handles these
def reset():
goto_index.obj_cnt = -1
def random(idx_str,src,dest=None,fp_set=False) :
goto_index.reset() # Just in case
src.seek(0) # Just in case
goto_index.sorted(idx_str,src,dest=None,fp_set=False)
goto_index.reset() # Clear count
除了fetch_elements()
之外,问题的其他功能基本相同:
def fetch_elements(f) :
elements = []
for idx_str in ('1','2','3') : # Indexes are passed sorted
elements.append(goto_index.sorted(idx_str,f))
goto_index.reset() # Required if using the methods later
return elements