在 python 中生成倒排索引
generate inverted index in python
大家好,我是 Python 新手,正在学习 Hadoop MapReduce。我有如下数据:
Vancouver-1 35.5
Vancouver-2 34.6
Vancouver-3 37.6
每行是“城市-月份”及其最高温度。
我想据此生成如下的倒排索引:
35 Vancouver-2
36 Vancouver-2 Vancouver-1
37 Vancouver-2 Vancouver-1
38 Vancouver-2 Vancouver-1 Vancouver-3
第一列是温度 D(取值范围为 10 到 50),其后是最高温度小于或等于 D 的“城市-月份”列表。
我的映射器文件:
%%writefile mapper.py
#!/usr/bin/env python
"""Streaming mapper for the temperature inverted index.

Reads ``city_month<TAB>max_temperature`` lines from stdin.  For every
whole-degree threshold D between QueryMinTemp and QueryMaxTemp (inclusive)
that is greater than or equal to the rounded-up max temperature, one record
``D<TAB>city_month<TAB>rounded_max_temperature`` is written to stdout.
"""
import math
import sys

# Inclusive bounds of the thresholds D that the index covers.
QueryMinTemp = 10
QueryMaxTemp = 50


def map_line(line, min_temp=QueryMinTemp, max_temp=QueryMaxTemp):
    """Return the list of output records produced by one input line.

    Args:
        line: one ``city_month<TAB>max_temperature`` record; a trailing
            newline is allowed.
        min_temp: lowest threshold D to emit (inclusive).
        max_temp: highest threshold D to emit (inclusive).

    Returns:
        A list of ``D<TAB>city_month<TAB>maxtemp`` strings, empty for a
        blank line or when the rounded temperature exceeds ``max_temp``.

    Raises:
        IndexError: the line has no tab-separated temperature field.
        ValueError: the temperature field is not a number.
    """
    if not line.strip():
        return []
    lfields = line.rstrip('\n').split('\t')
    city_month = lfields[0]
    # int() keeps the key an integer even where math.ceil returns a float.
    maxtemp = int(math.ceil(float(lfields[1])))
    # The record belongs to every threshold at or above its own temperature.
    first = max(maxtemp, min_temp)
    return ['{}\t{}\t{}'.format(d, city_month, maxtemp)
            for d in range(first, max_temp + 1)]


def main():
    """Map every line of stdin and print the resulting records."""
    for line in sys.stdin:
        for record in map_line(line):
            print(record)


if __name__ == '__main__':
    main()
我的 reducer 文件:
%%writefile reducer.py
#!/usr/bin/env python
"""Streaming reducer for the temperature inverted index.

Reads ``D<TAB>city_month[<TAB>...]`` lines from stdin and prints one row per
distinct key D: the key, a tab, then the space-separated city-month entries
collected for that key.

Assumes lines with the same key arrive consecutively (as after a sort by
key); a key that reappears later starts a new output row.
"""
import sys


def emit(maxtemp, city_month_list):
    """Print one index row: the key, a tab, then the city-month list."""
    print('{}\t{}'.format(maxtemp, city_month_list))


def reduce_lines(lines):
    """Group consecutive lines by key.

    Args:
        lines: iterable of ``D<TAB>city_month[<TAB>...]`` strings; trailing
            newlines are allowed, lines without a tab are skipped.

    Yields:
        ``(key, [city_month, ...])`` tuples, one per run of equal keys,
        with entries in arrival order.
    """
    last_maxtemp = None
    cities = []
    for line in lines:
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 2:
            continue
        # Only the second field is the city-month; any further fields
        # (e.g. a trailing temperature) are ignored.
        maxtemp, city_month = fields[0], fields[1]
        if maxtemp != last_maxtemp:
            if last_maxtemp is not None:
                yield last_maxtemp, cities
            last_maxtemp = maxtemp
            cities = []
        cities.append(city_month)
    # Flush the final group, which no key change follows.
    if last_maxtemp is not None:
        yield last_maxtemp, cities


def main():
    """Reduce stdin and print one row per key."""
    for maxtemp, cities in reduce_lines(sys.stdin):
        emit(maxtemp, ' '.join(cities))


if __name__ == '__main__':
    main()
我尝试过修复,但不知道该怎么改。有什么解决办法吗?我想生成像上面示例那样的倒排索引。谢谢!
你可以用 dict 对数据进行分组:键是最高温度,值是一个列表,把对应的数据(城市-月份)追加到该列表中。
"""Group city-month entries by their rounded-up maximum temperature."""
import math
import sys


def group_by_maxtemp(lines):
    """Build a ``{rounded_max_temperature: [city_month, ...]}`` mapping.

    Args:
        lines: iterable of ``city_month<TAB>max_temperature`` strings;
            trailing newlines are allowed, lines without a tab are skipped.

    Returns:
        A dict keyed by the integer ceiling of the temperature; each value
        lists the matching city-month entries in input order.

    Raises:
        ValueError: a temperature field is not a number.
    """
    res_dict = {}
    for line in lines:
        lfields = line.rstrip('\n').split('\t')
        if len(lfields) < 2:
            continue
        city_month = lfields[0]
        # int() keeps the key an integer even where math.ceil returns a float.
        maxtemp = int(math.ceil(float(lfields[1])))
        res_dict.setdefault(maxtemp, []).append(city_month)
    return res_dict


def main():
    """Group stdin and print one row per temperature, lowest first."""
    res_dict = group_by_maxtemp(sys.stdin)
    for maxtemp in sorted(res_dict):
        # NOTE(review): the row starts with a tab and prints the list's repr
        # before the temperature; format kept as written -- confirm intent.
        print('\t{}\t{}'.format(res_dict[maxtemp], maxtemp))


if __name__ == '__main__':
    main()
"""Collect ``key<TAB>value`` lines from stdin into a dict and print it."""
import sys


def emit(res_dict):
    """Print one row per key of ``res_dict``, in sorted key order.

    Args:
        res_dict: mapping from a temperature key to its list of values.
    """
    for maxtemp, city_month in sorted(res_dict.items()):
        # NOTE(review): the row starts with a tab and prints the list's repr
        # before the key; format kept as written -- confirm intent.
        print('\t{}\t{}'.format(city_month, maxtemp))


def collect(lines):
    """Group the text after the first tab of each line by the text before it.

    Args:
        lines: iterable of ``key<TAB>value`` strings; trailing newlines are
            allowed, lines without a tab are skipped.

    Returns:
        A dict mapping each key to the list of its values in input order.
    """
    res_dict = {}
    for line in lines:
        maxtemp, sep, city_month_lists = line.rstrip('\n').partition('\t')
        if not sep:
            continue
        res_dict.setdefault(maxtemp, []).append(city_month_lists)
    return res_dict


if __name__ == '__main__':
    emit(collect(sys.stdin))
抱歉,现在补上 reduce 部分。
"""Build a temperature inverted index from in-memory sample data."""
import math

data = """Vancouver-1 35.5
Vancouver-2 34.6
Vancouver-3 37.6"""


def build_inverted_index(text, low=10, high=50):
    """Return the inverted index for ``text`` as a multi-line string.

    Args:
        text: lines of ``city_month max_temperature`` separated by
            whitespace; blank lines are ignored.
        low: first threshold D to list (inclusive).
        high: last threshold D to list (inclusive).

    Returns:
        One line per threshold D from ``low`` to ``high``: D, a space, then
        every city-month whose rounded-up max temperature is <= D, each
        followed by a space, ordered by temperature (ties in input order).
        Every line ends with a newline.

    Raises:
        ValueError: a non-blank line lacks a numeric temperature field.
    """
    records = []
    for line in text.splitlines():
        if not line.strip():
            continue
        # Split on the last whitespace run so names may contain spaces.
        city_month, maxtemp = line.rsplit(None, 1)
        records.append((int(math.ceil(float(maxtemp))), city_month))
    # The sort is stable, so equal temperatures keep their input order.
    records.sort(key=lambda record: record[0])

    rows = []
    cities_str = ''
    pos = 0
    for temp in range(low, high + 1):
        # `while` admits every city at or below this threshold, not just
        # one per step; `<=` includes cities whose temperature equals it.
        while pos < len(records) and records[pos][0] <= temp:
            cities_str += records[pos][1] + ' '
            pos += 1
        rows.append(str(temp) + ' ' + cities_str + '\n')
    return ''.join(rows)


if __name__ == '__main__':
    print(build_inverted_index(data))
大家好,我是 Python 新手,正在学习 Hadoop MapReduce。我有如下数据:
Vancouver-1 35.5
Vancouver-2 34.6
Vancouver-3 37.6
每行是“城市-月份”及其最高温度。我想据此生成如下的倒排索引:
35 Vancouver-2
36 Vancouver-2 Vancouver-1
37 Vancouver-2 Vancouver-1
38 Vancouver-2 Vancouver-1 Vancouver-3
第一列是温度 D(取值范围为 10 到 50),其后是最高温度小于或等于 D 的“城市-月份”列表。
我的映射器文件:
%%writefile mapper.py
#!/usr/bin/env python
"""Streaming mapper for the temperature inverted index.

Reads ``city_month<TAB>max_temperature`` lines from stdin.  For every
whole-degree threshold D between QueryMinTemp and QueryMaxTemp (inclusive)
that is greater than or equal to the rounded-up max temperature, one record
``D<TAB>city_month<TAB>rounded_max_temperature`` is written to stdout.
"""
import math
import sys

# Inclusive bounds of the thresholds D that the index covers.
QueryMinTemp = 10
QueryMaxTemp = 50


def map_line(line, min_temp=QueryMinTemp, max_temp=QueryMaxTemp):
    """Return the list of output records produced by one input line.

    Args:
        line: one ``city_month<TAB>max_temperature`` record; a trailing
            newline is allowed.
        min_temp: lowest threshold D to emit (inclusive).
        max_temp: highest threshold D to emit (inclusive).

    Returns:
        A list of ``D<TAB>city_month<TAB>maxtemp`` strings, empty for a
        blank line or when the rounded temperature exceeds ``max_temp``.

    Raises:
        IndexError: the line has no tab-separated temperature field.
        ValueError: the temperature field is not a number.
    """
    if not line.strip():
        return []
    lfields = line.rstrip('\n').split('\t')
    city_month = lfields[0]
    # int() keeps the key an integer even where math.ceil returns a float.
    maxtemp = int(math.ceil(float(lfields[1])))
    # The record belongs to every threshold at or above its own temperature.
    first = max(maxtemp, min_temp)
    return ['{}\t{}\t{}'.format(d, city_month, maxtemp)
            for d in range(first, max_temp + 1)]


def main():
    """Map every line of stdin and print the resulting records."""
    for line in sys.stdin:
        for record in map_line(line):
            print(record)


if __name__ == '__main__':
    main()
我的 reducer 文件:
%%writefile reducer.py
#!/usr/bin/env python
"""Streaming reducer for the temperature inverted index.

Reads ``D<TAB>city_month[<TAB>...]`` lines from stdin and prints one row per
distinct key D: the key, a tab, then the space-separated city-month entries
collected for that key.

Assumes lines with the same key arrive consecutively (as after a sort by
key); a key that reappears later starts a new output row.
"""
import sys


def emit(maxtemp, city_month_list):
    """Print one index row: the key, a tab, then the city-month list."""
    print('{}\t{}'.format(maxtemp, city_month_list))


def reduce_lines(lines):
    """Group consecutive lines by key.

    Args:
        lines: iterable of ``D<TAB>city_month[<TAB>...]`` strings; trailing
            newlines are allowed, lines without a tab are skipped.

    Yields:
        ``(key, [city_month, ...])`` tuples, one per run of equal keys,
        with entries in arrival order.
    """
    last_maxtemp = None
    cities = []
    for line in lines:
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 2:
            continue
        # Only the second field is the city-month; any further fields
        # (e.g. a trailing temperature) are ignored.
        maxtemp, city_month = fields[0], fields[1]
        if maxtemp != last_maxtemp:
            if last_maxtemp is not None:
                yield last_maxtemp, cities
            last_maxtemp = maxtemp
            cities = []
        cities.append(city_month)
    # Flush the final group, which no key change follows.
    if last_maxtemp is not None:
        yield last_maxtemp, cities


def main():
    """Reduce stdin and print one row per key."""
    for maxtemp, cities in reduce_lines(sys.stdin):
        emit(maxtemp, ' '.join(cities))


if __name__ == '__main__':
    main()
我尝试过修复,但不知道该怎么改。有什么解决办法吗?我想生成像上面示例那样的倒排索引。谢谢!
你可以用 dict 对数据进行分组:键是最高温度,值是一个列表,把对应的数据(城市-月份)追加到该列表中。
"""Group city-month entries by their rounded-up maximum temperature."""
import math
import sys


def group_by_maxtemp(lines):
    """Build a ``{rounded_max_temperature: [city_month, ...]}`` mapping.

    Args:
        lines: iterable of ``city_month<TAB>max_temperature`` strings;
            trailing newlines are allowed, lines without a tab are skipped.

    Returns:
        A dict keyed by the integer ceiling of the temperature; each value
        lists the matching city-month entries in input order.

    Raises:
        ValueError: a temperature field is not a number.
    """
    res_dict = {}
    for line in lines:
        lfields = line.rstrip('\n').split('\t')
        if len(lfields) < 2:
            continue
        city_month = lfields[0]
        # int() keeps the key an integer even where math.ceil returns a float.
        maxtemp = int(math.ceil(float(lfields[1])))
        res_dict.setdefault(maxtemp, []).append(city_month)
    return res_dict


def main():
    """Group stdin and print one row per temperature, lowest first."""
    res_dict = group_by_maxtemp(sys.stdin)
    for maxtemp in sorted(res_dict):
        # NOTE(review): the row starts with a tab and prints the list's repr
        # before the temperature; format kept as written -- confirm intent.
        print('\t{}\t{}'.format(res_dict[maxtemp], maxtemp))


if __name__ == '__main__':
    main()
"""Collect ``key<TAB>value`` lines from stdin into a dict and print it."""
import sys


def emit(res_dict):
    """Print one row per key of ``res_dict``, in sorted key order.

    Args:
        res_dict: mapping from a temperature key to its list of values.
    """
    for maxtemp, city_month in sorted(res_dict.items()):
        # NOTE(review): the row starts with a tab and prints the list's repr
        # before the key; format kept as written -- confirm intent.
        print('\t{}\t{}'.format(city_month, maxtemp))


def collect(lines):
    """Group the text after the first tab of each line by the text before it.

    Args:
        lines: iterable of ``key<TAB>value`` strings; trailing newlines are
            allowed, lines without a tab are skipped.

    Returns:
        A dict mapping each key to the list of its values in input order.
    """
    res_dict = {}
    for line in lines:
        maxtemp, sep, city_month_lists = line.rstrip('\n').partition('\t')
        if not sep:
            continue
        res_dict.setdefault(maxtemp, []).append(city_month_lists)
    return res_dict


if __name__ == '__main__':
    emit(collect(sys.stdin))
抱歉,现在补上 reduce 部分。
"""Build a temperature inverted index from in-memory sample data."""
import math

data = """Vancouver-1 35.5
Vancouver-2 34.6
Vancouver-3 37.6"""


def build_inverted_index(text, low=10, high=50):
    """Return the inverted index for ``text`` as a multi-line string.

    Args:
        text: lines of ``city_month max_temperature`` separated by
            whitespace; blank lines are ignored.
        low: first threshold D to list (inclusive).
        high: last threshold D to list (inclusive).

    Returns:
        One line per threshold D from ``low`` to ``high``: D, a space, then
        every city-month whose rounded-up max temperature is <= D, each
        followed by a space, ordered by temperature (ties in input order).
        Every line ends with a newline.

    Raises:
        ValueError: a non-blank line lacks a numeric temperature field.
    """
    records = []
    for line in text.splitlines():
        if not line.strip():
            continue
        # Split on the last whitespace run so names may contain spaces.
        city_month, maxtemp = line.rsplit(None, 1)
        records.append((int(math.ceil(float(maxtemp))), city_month))
    # The sort is stable, so equal temperatures keep their input order.
    records.sort(key=lambda record: record[0])

    rows = []
    cities_str = ''
    pos = 0
    for temp in range(low, high + 1):
        # `while` admits every city at or below this threshold, not just
        # one per step; `<=` includes cities whose temperature equals it.
        while pos < len(records) and records[pos][0] <= temp:
            cities_str += records[pos][1] + ' '
            pos += 1
        rows.append(str(temp) + ' ' + cities_str + '\n')
    return ''.join(rows)


if __name__ == '__main__':
    print(build_inverted_index(data))