日期时间对象比较
Datetime object compare
我正在尝试查找最近 2 小时内发布的最新新闻。
方法
我的目标是获取今天的 datetime
对象并将其与我从网上抓取的文章的日期进行比较。
首先我按日期比较日期时间,然后按小时比较。
问题
然而,即使给出了正确的日期,它似乎也说它不在正确的范围内。
假失败:
代码
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import Request
from datetime import timedelta
from datetime import datetime
def newz(stock_1):
list_now=stock_1
#list_now=chr(list_now)
new_list=list_now
list_now=[list_now]
print("Stock:{}".format(list_now))
n = 1 #the # of article headlines displayed per ticker
tickers= list_now
new_words = {
'Insider Sells':-3.4,
'common':2.0,
'up':3.4,
'bankruptcy':-3.4,
'underperforms':-3.4,
'overperforms':3.4,
'outperforms':3.4,
'overbought':-3.4,
'oversold':3.4,
'down':2.0,
}
finviz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}
for ticker in tickers:
url = finviz_url + ticker
req = Request(url=url,headers={'user-agent': 'my-app/0.0.1'})
resp = urlopen(req)
html = BeautifulSoup(resp, features="lxml")
#print(html)
news_table = html.find(id='news-table')
news_tables[ticker] = news_table
try:
for ticker in tickers:
df = news_tables[ticker]
df_tr = df.findAll('tr')
print ('\n')
print ('Recent News Headlines for {}: '.format(ticker))
for i, table_row in enumerate(df_tr):
a_text = table_row.a.text
td_text = table_row.td.text
td_text = td_text.strip()
print("{0} {1}".format(a_text,td_text))
td_text=str(td_text)
a_text=str(a_text)
# looks specfically for investigation or shareholder alert or investigation alert
result=a_text.find("INVESTIGATION")
result=int(result)
result_1=a_text.find("SHAREHOLDER")
result_1=int(result_1)
result_2=a_text.find("ALERT")
result_2=int(result_2)
result_3=a_text.find("INVESTOR")
result_3=int(result_3)
result_4=a_text.find("NOTICE")
result_4=int(result_4)
if (result>=0 or result_1>=0) and result_2>=0:
print("Fails: Under Investigation")
return True
elif result>=0 :
print("Fails: Under Investigation")
return True
elif result_3>=0 and result_2>=0 :
print("Fails: Under Investigation")
elif result_1>=0 and result_4>=0 :
print("Fails: Under Investigation")
elif result_1>=0 and result_2>=0 :
print("Fails: Under Investigation")
elif result==-1 and result_1==-1 and result_2==-1:
pass
if i == n-1:
break
except AttributeError:
return True
td_text=td_text.split("-",2)
#print("A",td_text)
month=td_text[0]
day=td_text[1]
year=td_text[2]
months={'Jan': 1, 'Feb':2, 'Mar': 3, 'Apr':4,'May':5, 'Jun': 6, 'Jul':7,'Aug':8,'Sep':9,'Oct':10, 'Nov':11,'Dec':12}
month=months.pop(month)
#print("MOnth",month)
#print("Year",year)
#print("Day",day)
hour=year[3:5]
#print("Hour data",hour)
meridian=year[-1:-2]
#print("Meridian",meridian)
year=year[0:2]
year=str(year)
day=str(day)
month=str(month)
year="20"+year
# date we are stripping from web
t=year+'-'+month+'-'+day+'-'+hour
#print(t)
t_hour = datetime.strptime(hour, '%H')
t = datetime.strptime(t, '%Y-%m-%d-%H')
#print(t)
# todays date in datetime object
today =datetime.today().strftime('%Y-%m-%d-%H')
today = datetime.strptime(today, '%Y-%m-%d-%H')
hr_margin=timedelta(hours= 2)
margin = timedelta(days = 1)
#print( "Earnings date:{} ".format(t))
#print("Today:",today)
diff_minus=today - margin
#print(diff_minus)
diff_plus=today + margin
diff_hr_plus= today +hr_margin
diff_hr_minus= today-hr_margin
#print(diff_plus)
#t_hour=t_hour[10::]
if date_comp(t,diff_plus,diff_minus,diff_hr_plus,diff_hr_minus,t_hour,today):
return True
def date_comp(t,diff_plus,diff_minus,diff_hr_plus,diff_hr_minus,t_hour,today):
if diff_plus<= t <= diff_plus:
print("Day is good")
print("Max allowed date {}".format(diff_plus))
print("Min allowed date {}".format(diff_minus))
print('Stripped Datetime {}'.format(t))
#print("Measured time hr",t_hour)
if diff_hr_minus<=t_hour <=diff_hr_plus:
print("Hour is good")
print("Max allowed Hr {}".format(diff_hr_plus))
print("Min allowed Hr {}".format(diff_hr_minus))
print('Stripped Datetime {}'.format(t))
print("News is up to date by Hour!! Time :{0} Story Hit: {1}".format(today,t))
print("\n")
return False
else:
print("News is NOT up to date by Hour!! Time :{0} Story Hit: {1}".format(today,t))
print("Max allowed Hr {}".format(diff_hr_plus))
print("Min allowed Hr {}".format(diff_hr_minus))
print("\n")
return True
else:
print("News is NOT up to date by Day!! {} ".format(t))
print("Max allowed date {}".format(diff_plus))
print("Min allowed date {}".format(diff_minus))
print("\n")
return True
看看我如何通过几个步骤将您给定的代码简化为 minimal reproducible example:
- 删除 web-scraping(因为对问题而言不是必需的)
- 去掉所有不解释的评论
- 删除对结构没有帮助的空行
- (可选)添加显示问题的测试(例如 function-call)
已解决问题
由于您声称日期时间比较存在问题,我发现了一个不一致之处:
在日比较中你有:
if diff_plus<= t <= diff_plus:
小时比较你有:
if diff_hr_minus<=t_hour <=diff_hr_plus:
需要解决的问题:
最小可重现示例
from datetime import timedelta
from datetime import datetime
def compared_date_from_td(td_text):
"""Returns datetime for input of format 'Jan-24-22 05:48PM'."""
# date from web
t = datetime.strptime(td_text.strip(), '%b-%d-%y %H:%M%p')
print("Earnings date: {} ".format(t))
# todays date in datetime object
today = datetime.today()
print("Today: {}".format(today))
margin = timedelta(days = 1)
diff_minus = today - margin
diff_plus = today + margin
t_hour = t.hour # get the hour part of datetime t
diff_hr_plus = today.hour + 2
diff_hr_minus = today.hour - 2
if date_comp(t,diff_plus,diff_minus,diff_hr_plus,diff_hr_minus,t_hour,today):
return True
def date_comp(t,diff_plus,diff_minus,diff_hr_plus,diff_hr_minus,t_hour,today):
if diff_minus <= t <= diff_plus: # issue fixed! was: diff_plus<= t <= diff_plus
print("Day is good")
print("Max allowed date {}".format(diff_plus))
print("Min allowed date {}".format(diff_minus))
print('Stripped Datetime {}'.format(t))
#print("Measured time hr",t_hour)
if diff_hr_minus <= t_hour <= diff_hr_plus:
print("Hour is good")
print("Max allowed Hr {}".format(diff_hr_plus))
print("Min allowed Hr {}".format(diff_hr_minus))
print('Stripped Datetime {}'.format(t))
print("News is up to date by Hour!! Time :{0} Story Hit: {1}".format(today,t))
print("\n")
return False
else:
print("News is NOT up to date by Hour!! Time :{0} Story Hit: {1}".format(today,t))
print("Max allowed Hr {}".format(diff_hr_plus))
print("Min allowed Hr {}".format(diff_hr_minus))
print("\n")
return True
else:
print("News is NOT up to date by Day!! {} ".format(t))
print("Max allowed date {}".format(diff_plus))
print("Min allowed date {}".format(diff_minus))
print("\n")
return True
date_text = 'Jan-24-22 05:48PM'
not_uptodate = compared_date_from_td(date_text)
print("date: {}, compared as not_uptodate => {}".format(date_text, not_uptodate))
这输出:
Earnings date: 2022-01-24 05:48:00
Today: 2022-01-25 00:54:16.122160
Day is good
Max allowed date 2022-01-26 00:54:16.122160
Min allowed date 2022-01-24 00:54:16.122160
Stripped Datetime 2022-01-24 05:48:00
News is NOT up to date by Hour!! Time :2022-01-25 00:54:16.122160 Story Hit: 2022-01-24 05:48:00
Max allowed Hr 2
Min allowed Hr -2
date: Jan-24-22 05:48PM, compared as not_uptodate => True
注意:某些格式需要调整才能有意义,例如 Max allowed Hr 2
等。还有比较 returns True
if outside margins as "NOT up to date" (就像在上面的例子中,小时相差超过 2 小时)。
正确的解决方案
我的目标是否正确:测试日期时间是否在最近 2 小时内。
(1) 给定一段从网络上抓取的文本,您将确定格式并将其解析为日期时间。
(2) 给定日期时间,您将计算到现在的时间增量。然后测试这个是否小于2小时:
from datetime import datetime, timedelta
scrapedText = 'Jan-24-22 05:48PM'
newsTime = datetime.strptime(scrapedText, '%b-%d-%y %H:%M%p') # (1) parse datetime
if datetime.now() - newsTime < timedelta(hours=2): # (2) whithin last 2 hours
# news is less than 2 hours ago
另请参阅:
How to find information from the last 24 hours
我正在尝试查找最近 2 小时内发布的最新新闻。
方法
我的目标是获取今天的 datetime
对象并将其与我从网上抓取的文章的日期进行比较。
首先我按日期比较日期时间,然后按小时比较。
问题
然而,即使给出了正确的日期,它似乎也说它不在正确的范围内。
假失败:
代码
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import Request
from datetime import timedelta
from datetime import datetime
def newz(stock_1):
list_now=stock_1
#list_now=chr(list_now)
new_list=list_now
list_now=[list_now]
print("Stock:{}".format(list_now))
n = 1 #the # of article headlines displayed per ticker
tickers= list_now
new_words = {
'Insider Sells':-3.4,
'common':2.0,
'up':3.4,
'bankruptcy':-3.4,
'underperforms':-3.4,
'overperforms':3.4,
'outperforms':3.4,
'overbought':-3.4,
'oversold':3.4,
'down':2.0,
}
finviz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}
for ticker in tickers:
url = finviz_url + ticker
req = Request(url=url,headers={'user-agent': 'my-app/0.0.1'})
resp = urlopen(req)
html = BeautifulSoup(resp, features="lxml")
#print(html)
news_table = html.find(id='news-table')
news_tables[ticker] = news_table
try:
for ticker in tickers:
df = news_tables[ticker]
df_tr = df.findAll('tr')
print ('\n')
print ('Recent News Headlines for {}: '.format(ticker))
for i, table_row in enumerate(df_tr):
a_text = table_row.a.text
td_text = table_row.td.text
td_text = td_text.strip()
print("{0} {1}".format(a_text,td_text))
td_text=str(td_text)
a_text=str(a_text)
# looks specfically for investigation or shareholder alert or investigation alert
result=a_text.find("INVESTIGATION")
result=int(result)
result_1=a_text.find("SHAREHOLDER")
result_1=int(result_1)
result_2=a_text.find("ALERT")
result_2=int(result_2)
result_3=a_text.find("INVESTOR")
result_3=int(result_3)
result_4=a_text.find("NOTICE")
result_4=int(result_4)
if (result>=0 or result_1>=0) and result_2>=0:
print("Fails: Under Investigation")
return True
elif result>=0 :
print("Fails: Under Investigation")
return True
elif result_3>=0 and result_2>=0 :
print("Fails: Under Investigation")
elif result_1>=0 and result_4>=0 :
print("Fails: Under Investigation")
elif result_1>=0 and result_2>=0 :
print("Fails: Under Investigation")
elif result==-1 and result_1==-1 and result_2==-1:
pass
if i == n-1:
break
except AttributeError:
return True
td_text=td_text.split("-",2)
#print("A",td_text)
month=td_text[0]
day=td_text[1]
year=td_text[2]
months={'Jan': 1, 'Feb':2, 'Mar': 3, 'Apr':4,'May':5, 'Jun': 6, 'Jul':7,'Aug':8,'Sep':9,'Oct':10, 'Nov':11,'Dec':12}
month=months.pop(month)
#print("MOnth",month)
#print("Year",year)
#print("Day",day)
hour=year[3:5]
#print("Hour data",hour)
meridian=year[-1:-2]
#print("Meridian",meridian)
year=year[0:2]
year=str(year)
day=str(day)
month=str(month)
year="20"+year
# date we are stripping from web
t=year+'-'+month+'-'+day+'-'+hour
#print(t)
t_hour = datetime.strptime(hour, '%H')
t = datetime.strptime(t, '%Y-%m-%d-%H')
#print(t)
# todays date in datetime object
today =datetime.today().strftime('%Y-%m-%d-%H')
today = datetime.strptime(today, '%Y-%m-%d-%H')
hr_margin=timedelta(hours= 2)
margin = timedelta(days = 1)
#print( "Earnings date:{} ".format(t))
#print("Today:",today)
diff_minus=today - margin
#print(diff_minus)
diff_plus=today + margin
diff_hr_plus= today +hr_margin
diff_hr_minus= today-hr_margin
#print(diff_plus)
#t_hour=t_hour[10::]
if date_comp(t,diff_plus,diff_minus,diff_hr_plus,diff_hr_minus,t_hour,today):
return True
def date_comp(t,diff_plus,diff_minus,diff_hr_plus,diff_hr_minus,t_hour,today):
if diff_plus<= t <= diff_plus:
print("Day is good")
print("Max allowed date {}".format(diff_plus))
print("Min allowed date {}".format(diff_minus))
print('Stripped Datetime {}'.format(t))
#print("Measured time hr",t_hour)
if diff_hr_minus<=t_hour <=diff_hr_plus:
print("Hour is good")
print("Max allowed Hr {}".format(diff_hr_plus))
print("Min allowed Hr {}".format(diff_hr_minus))
print('Stripped Datetime {}'.format(t))
print("News is up to date by Hour!! Time :{0} Story Hit: {1}".format(today,t))
print("\n")
return False
else:
print("News is NOT up to date by Hour!! Time :{0} Story Hit: {1}".format(today,t))
print("Max allowed Hr {}".format(diff_hr_plus))
print("Min allowed Hr {}".format(diff_hr_minus))
print("\n")
return True
else:
print("News is NOT up to date by Day!! {} ".format(t))
print("Max allowed date {}".format(diff_plus))
print("Min allowed date {}".format(diff_minus))
print("\n")
return True
看看我如何通过几个步骤将您给定的代码简化为 minimal reproducible example:
- 删除 web-scraping(因为对问题而言不是必需的)
- 去掉所有不解释的评论
- 删除对结构没有帮助的空行
- (可选)添加显示问题的测试(例如 function-call)
已解决问题
由于您声称日期时间比较存在问题,我发现了一个不一致之处:
在日比较中你有:
if diff_plus<= t <= diff_plus:
小时比较你有:
if diff_hr_minus<=t_hour <=diff_hr_plus:
需要解决的问题:
最小可重现示例
from datetime import timedelta
from datetime import datetime
def compared_date_from_td(td_text):
"""Returns datetime for input of format 'Jan-24-22 05:48PM'."""
# date from web
t = datetime.strptime(td_text.strip(), '%b-%d-%y %H:%M%p')
print("Earnings date: {} ".format(t))
# todays date in datetime object
today = datetime.today()
print("Today: {}".format(today))
margin = timedelta(days = 1)
diff_minus = today - margin
diff_plus = today + margin
t_hour = t.hour # get the hour part of datetime t
diff_hr_plus = today.hour + 2
diff_hr_minus = today.hour - 2
if date_comp(t,diff_plus,diff_minus,diff_hr_plus,diff_hr_minus,t_hour,today):
return True
def date_comp(t,diff_plus,diff_minus,diff_hr_plus,diff_hr_minus,t_hour,today):
if diff_minus <= t <= diff_plus: # issue fixed! was: diff_plus<= t <= diff_plus
print("Day is good")
print("Max allowed date {}".format(diff_plus))
print("Min allowed date {}".format(diff_minus))
print('Stripped Datetime {}'.format(t))
#print("Measured time hr",t_hour)
if diff_hr_minus <= t_hour <= diff_hr_plus:
print("Hour is good")
print("Max allowed Hr {}".format(diff_hr_plus))
print("Min allowed Hr {}".format(diff_hr_minus))
print('Stripped Datetime {}'.format(t))
print("News is up to date by Hour!! Time :{0} Story Hit: {1}".format(today,t))
print("\n")
return False
else:
print("News is NOT up to date by Hour!! Time :{0} Story Hit: {1}".format(today,t))
print("Max allowed Hr {}".format(diff_hr_plus))
print("Min allowed Hr {}".format(diff_hr_minus))
print("\n")
return True
else:
print("News is NOT up to date by Day!! {} ".format(t))
print("Max allowed date {}".format(diff_plus))
print("Min allowed date {}".format(diff_minus))
print("\n")
return True
date_text = 'Jan-24-22 05:48PM'
not_uptodate = compared_date_from_td(date_text)
print("date: {}, compared as not_uptodate => {}".format(date_text, not_uptodate))
这输出:
Earnings date: 2022-01-24 05:48:00
Today: 2022-01-25 00:54:16.122160
Day is good
Max allowed date 2022-01-26 00:54:16.122160
Min allowed date 2022-01-24 00:54:16.122160
Stripped Datetime 2022-01-24 05:48:00
News is NOT up to date by Hour!! Time :2022-01-25 00:54:16.122160 Story Hit: 2022-01-24 05:48:00
Max allowed Hr 2
Min allowed Hr -2
date: Jan-24-22 05:48PM, compared as not_uptodate => True
注意:某些格式需要调整才能有意义,例如 Max allowed Hr 2
等。还有比较 returns True
if outside margins as "NOT up to date" (就像在上面的例子中,小时相差超过 2 小时)。
正确的解决方案
我的目标是否正确:测试日期时间是否在最近 2 小时内。
(1) 给定一段从网络上抓取的文本,您将确定格式并将其解析为日期时间。
(2) 给定日期时间,您将计算到现在的时间增量。然后测试这个是否小于2小时:
from datetime import datetime, timedelta
scrapedText = 'Jan-24-22 05:48PM'
newsTime = datetime.strptime(scrapedText, '%b-%d-%y %H:%M%p') # (1) parse datetime
if datetime.now() - newsTime < timedelta(hours=2): # (2) whithin last 2 hours
# news is less than 2 hours ago
另请参阅: How to find information from the last 24 hours