使用 happybase 输出分离的 HBase 列
Output separated HBase columns using happybase
我有这样的HBase-table:
total date1:tCount1 date2:tCount2 ...
url1 date1:clickCount1 date2:clickCount2 ...
url2 date1:clickCount1 date2:clickCount2 ...
...
url1, url2, ... 是行键。该表只有一个列族。
我有一个日期范围(从 datei 到 datej)作为输入。
我需要为每个 url 输出它在每一天的点击份额。
输出必须有这样的格式:
datei url1:share1 url2:share1...
...
datej url1:share1 url2:share1...
其中
datei.url1:share1 = url1.datei:clickCount1 / total datei:tCount1
我开始编写 happybase 脚本,但我不知道如何使用 happybase 从行中单独选取各个列。
我的 happybase 脚本如下:
import argparse
import calendar
import getpass
import happybase
import logging
import random
import sys
# Usage text handed to argparse; {0} is replaced with the script path.
USAGE = """
To query daily data for a year, run:
$ {0} --action query --year 2014
To query daily data for a particular month, run:
$ {0} --action query --year 2014 --month 10
To query daily data for a particular day, run:
$ {0} --action query --year 2014 --month 10 --day 27
To compute totals add `--total` argument.
""".format(sys.argv[0])

logging.basicConfig(level=logging.DEBUG)

# Candidate HBase Thrift hosts (bds07 .. bds09); one is picked at random
# per connection. `range` is used instead of the Python 2-only `xrange`;
# the resulting list is identical on Python 2 and 3.
HOSTS = ["bds%02d.vdi.mipt.ru" % i for i in range(7, 10)]

# Table name is suffixed with the current login name.
TABLE = "VisitCountPy-" + getpass.getuser()
def connect():
    """Connect to a randomly chosen HBase Thrift server and return the table.

    If ``TABLE`` is not among the server's tables it is created with a
    single column family ``cf`` using default settings.

    Returns:
        The happybase table handle for ``TABLE``.
    """
    host = random.choice(HOSTS)
    # Log first, so the chosen host is on record even if connecting fails.
    logging.debug("Connecting to HBase Thrift Server on %s", host)
    conn = happybase.Connection(host)
    # NOTE(review): happybase connects in the constructor by default
    # (autoconnect), so this call is expected to be a no-op; kept to make
    # the open explicit.
    conn.open()
    if TABLE not in conn.tables():
        # Create a table with column family `cf` with default settings.
        conn.create_table(TABLE, {"cf": dict()})
        logging.debug("Created table %s", TABLE)
    else:
        logging.debug("Using table %s", TABLE)
    # happybase documents Connection.table() as the way to obtain a Table.
    return conn.table(TABLE)
def query(args, table):
    """Scan *table* over the requested date range and print the results.

    Row keys are taken to be the ``YYYYMMDD`` strings produced by
    ``get_time_range(args)``; each row's ``cf:value`` cell is read.

    Args:
        args: parsed command-line namespace; ``args.total`` selects between
            printing one ``key<TAB>value`` line per row and printing a single
            ``total<TAB>sum`` line.
        table: object exposing ``scan(row_start=..., row_stop=...)`` that
            yields ``(key, data)`` pairs (a happybase table).
    """
    days = list(get_time_range(args))
    total = 0
    if days:
        # happybase documents row_stop as exclusive. Appending the lowest
        # byte yields the smallest key greater than the last day, so the
        # last requested day is included in the scan.
        rows = table.scan(row_start=min(days), row_stop=max(days) + "\x00")
    else:
        # No valid day selected: nothing to scan (avoids min() on empty list).
        rows = ()
    for key, data in rows:
        if args.total:
            total += int(data["cf:value"])
        else:
            # Single-argument print() parses the same on Python 2 and 3.
            print("%s\t%s" % (key, data["cf:value"]))
    if args.total:
        print("total\t%s" % total)
def get_time_range(args):
    """Yield a ``YYYYMMDD`` string for every day selected by *args*.

    Args:
        args: object with attributes ``year`` (int), ``month`` (int or
            ``None`` for all twelve months) and ``day`` (int or ``None``
            for every day of each selected month).

    Yields:
        Date strings in ascending order. A given ``day`` is emitted as-is
        (it is not checked against the month length); values <= 0 are
        skipped.
    """
    cal = calendar.Calendar()
    year = args.year
    months = [args.month] if args.month is not None else range(1, 1 + 12)
    for month in months:
        if args.day is not None:
            days = [args.day]
        else:
            # itermonthdays() pads the leading/trailing weeks with zeros;
            # the `day > 0` test below drops that padding.
            days = cal.itermonthdays(year, month)
        for day in days:
            if day > 0:
                yield "%04d%02d%02d" % (year, month, day)
def main():
    """Parse and validate the command line, then run the query.

    Raises:
        RuntimeError: if a day is given without a month, or if the month or
            day is outside its valid range.
    """
    parser = argparse.ArgumentParser(description="An HBase example", usage=USAGE)
    # NOTE(review): "generate" is an accepted choice, but this function only
    # ever runs the query path — confirm whether a generate action exists.
    parser.add_argument("--action", metavar="ACTION", choices=("generate", "query"), required=True)
    parser.add_argument("--year", type=int, required=True)
    parser.add_argument("--month", type=int, default=None)
    parser.add_argument("--day", type=int, default=None)
    parser.add_argument("--total", action="store_true", default=False)
    args = parser.parse_args()

    # Validate before connecting so bad input never opens a connection.
    if args.day is not None and args.month is None:
        raise RuntimeError("Please, specify a month when specifying a day.")
    if args.month is not None and not 1 <= args.month <= 12:
        raise RuntimeError("Please, specify a valid month.")
    # Day 0 is rejected too: it selects no date at all.
    if args.day is not None and not 1 <= args.day <= 31:
        raise RuntimeError("Please, specify a valid day.")

    table = connect()
    query(args, table)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
那么,我应该如何修改我的脚本(实际上是 query() 函数),以便获取指定日期范围内的各个单独的列?
我认为您应该使用扫描器过滤器(scanner filter):可以通过 scan(filter=...) 参数以字符串形式传入,该字符串会在服务器端解析。
请参阅 https://github.com/wbolster/happybase/issues/11 以获取一些指示(示例、文档)。
我有这样的HBase-table:
total date1:tCount1 date2:tCount2 ...
url1 date1:clickCount1 date2:clickCount2 ...
url2 date1:clickCount1 date2:clickCount2 ...
...
url1, url2, ... 是行键。该表只有一个列族。
我有一个日期范围(从 datei 到 datej)作为输入。
我需要为每个 url 输出它在每一天的点击份额。
输出必须有这样的格式:
datei url1:share1 url2:share1...
...
datej url1:share1 url2:share1...
其中
datei.url1:share1 = url1.datei:clickCount1 / total datei:tCount1
我开始编写 happybase 脚本,但我不知道如何使用 happybase 从行中单独选取各个列。我的 happybase 脚本如下:
import argparse
import calendar
import getpass
import happybase
import logging
import random
import sys
# Usage text handed to argparse; {0} is replaced with the script path.
USAGE = """
To query daily data for a year, run:
$ {0} --action query --year 2014
To query daily data for a particular month, run:
$ {0} --action query --year 2014 --month 10
To query daily data for a particular day, run:
$ {0} --action query --year 2014 --month 10 --day 27
To compute totals add `--total` argument.
""".format(sys.argv[0])

logging.basicConfig(level=logging.DEBUG)

# Candidate HBase Thrift hosts (bds07 .. bds09); one is picked at random
# per connection. `range` is used instead of the Python 2-only `xrange`;
# the resulting list is identical on Python 2 and 3.
HOSTS = ["bds%02d.vdi.mipt.ru" % i for i in range(7, 10)]

# Table name is suffixed with the current login name.
TABLE = "VisitCountPy-" + getpass.getuser()
def connect():
    """Connect to a randomly chosen HBase Thrift server and return the table.

    If ``TABLE`` is not among the server's tables it is created with a
    single column family ``cf`` using default settings.

    Returns:
        The happybase table handle for ``TABLE``.
    """
    host = random.choice(HOSTS)
    # Log first, so the chosen host is on record even if connecting fails.
    logging.debug("Connecting to HBase Thrift Server on %s", host)
    conn = happybase.Connection(host)
    # NOTE(review): happybase connects in the constructor by default
    # (autoconnect), so this call is expected to be a no-op; kept to make
    # the open explicit.
    conn.open()
    if TABLE not in conn.tables():
        # Create a table with column family `cf` with default settings.
        conn.create_table(TABLE, {"cf": dict()})
        logging.debug("Created table %s", TABLE)
    else:
        logging.debug("Using table %s", TABLE)
    # happybase documents Connection.table() as the way to obtain a Table.
    return conn.table(TABLE)
def query(args, table):
    """Scan *table* over the requested date range and print the results.

    Row keys are taken to be the ``YYYYMMDD`` strings produced by
    ``get_time_range(args)``; each row's ``cf:value`` cell is read.

    Args:
        args: parsed command-line namespace; ``args.total`` selects between
            printing one ``key<TAB>value`` line per row and printing a single
            ``total<TAB>sum`` line.
        table: object exposing ``scan(row_start=..., row_stop=...)`` that
            yields ``(key, data)`` pairs (a happybase table).
    """
    days = list(get_time_range(args))
    total = 0
    if days:
        # happybase documents row_stop as exclusive. Appending the lowest
        # byte yields the smallest key greater than the last day, so the
        # last requested day is included in the scan.
        rows = table.scan(row_start=min(days), row_stop=max(days) + "\x00")
    else:
        # No valid day selected: nothing to scan (avoids min() on empty list).
        rows = ()
    for key, data in rows:
        if args.total:
            total += int(data["cf:value"])
        else:
            # Single-argument print() parses the same on Python 2 and 3.
            print("%s\t%s" % (key, data["cf:value"]))
    if args.total:
        print("total\t%s" % total)
def get_time_range(args):
    """Yield a ``YYYYMMDD`` string for every day selected by *args*.

    Args:
        args: object with attributes ``year`` (int), ``month`` (int or
            ``None`` for all twelve months) and ``day`` (int or ``None``
            for every day of each selected month).

    Yields:
        Date strings in ascending order. A given ``day`` is emitted as-is
        (it is not checked against the month length); values <= 0 are
        skipped.
    """
    cal = calendar.Calendar()
    year = args.year
    months = [args.month] if args.month is not None else range(1, 1 + 12)
    for month in months:
        if args.day is not None:
            days = [args.day]
        else:
            # itermonthdays() pads the leading/trailing weeks with zeros;
            # the `day > 0` test below drops that padding.
            days = cal.itermonthdays(year, month)
        for day in days:
            if day > 0:
                yield "%04d%02d%02d" % (year, month, day)
def main():
    """Parse and validate the command line, then run the query.

    Raises:
        RuntimeError: if a day is given without a month, or if the month or
            day is outside its valid range.
    """
    parser = argparse.ArgumentParser(description="An HBase example", usage=USAGE)
    # NOTE(review): "generate" is an accepted choice, but this function only
    # ever runs the query path — confirm whether a generate action exists.
    parser.add_argument("--action", metavar="ACTION", choices=("generate", "query"), required=True)
    parser.add_argument("--year", type=int, required=True)
    parser.add_argument("--month", type=int, default=None)
    parser.add_argument("--day", type=int, default=None)
    parser.add_argument("--total", action="store_true", default=False)
    args = parser.parse_args()

    # Validate before connecting so bad input never opens a connection.
    if args.day is not None and args.month is None:
        raise RuntimeError("Please, specify a month when specifying a day.")
    if args.month is not None and not 1 <= args.month <= 12:
        raise RuntimeError("Please, specify a valid month.")
    # Day 0 is rejected too: it selects no date at all.
    if args.day is not None and not 1 <= args.day <= 31:
        raise RuntimeError("Please, specify a valid day.")

    table = connect()
    query(args, table)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
那么,我应该如何修改我的脚本(实际上是 query() 函数),以便获取指定日期范围内的各个单独的列?
我认为您应该使用扫描器过滤器(scanner filter):可以通过 scan(filter=...) 参数以字符串形式传入,该字符串会在服务器端解析。
请参阅 https://github.com/wbolster/happybase/issues/11 以获取一些指示(示例、文档)。