将 4 位数年份格式转换为 Python MM/DD/YYYY 日期时间格式
Converting 4-digit year format to Python MM/DD/YYYY date time format
我使用正则表达式的 .findall() 函数从 pandas 数据框中的字符串里提取不同格式的日期时间元素,得到 'list' 对象,并将它们放入名为 "new" 的新列中。但是,有些日期时间列表对象只有 4 位 YYYY 格式,缺少月 (MM) 和日 (DD)(例如 df['new'].iloc[99]),还有一些对象只缺少日 (DD)(例如 df['new'].iloc[221]),如下所示:
new
0 [6/12/2009]
1 [12-10-2013]
2 [7/8/71]
3 [9-27-75]
4 [23rd May, 96]
5 [7/06/79]
...
99 [1968]
...
221 [8/2009]
...
470 [May 22nd, 2015]
注意:每个单元格都是一个列表对象。
因为我想在所有这些日期提取和格式化完成后按时间顺序对它们进行排序,所以为了方便起见,对于像 [1968] 这样的单元格值,我假设它是该年的第一天(即 1968 年 1 月 1 日);对于像 [8/2009](或 [08/2009])这样的单元格值,我假设它是该年该月的第一天(即 2009 年 8 月 1 日)。
所以我问有没有办法写一个简单的函数来将[YYYY]和[M/YYYY](或[MM/YYYY])格式全部转换成[MM/DD/YYYY]格式,如
[1968] to [01/01/1968]
[8/2009](or [08/2009]) to [08/01/2009]
对列 df['new'] 中缺少月份和日期信息的(可能)数百个列表对象 [] 执行此转换的最简单方法是什么?
[编辑]
我使用了以下代码(使用 Bandi A 提供的 change_format() 函数)
import pandas as pd
import re
import datetime as dt
# Load the text file (no header); every line, including its trailing
# newline, becomes one record.
with open('dates.txt') as file:
    doc = file.readlines()
# Hold the raw text in a DataFrame column: assigning df['new'] on a bare
# Series would set a single *label* called 'new' instead of adding a column.
df = pd.DataFrame({'text': doc})
# Use regex findall() to extract every date-like substring of each line;
# each cell of 'new' is a (possibly empty) list of matched strings.
df['new'] = df['text'].str.findall(r'\b(\d{1,2}\/\d{1,2}\/\d{2,4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*-\d{2}-\d{2,4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2}. )\d{4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2}, )\d{4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2} )\d{4}|(?:\d{2} )(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{4})|(?:\d{2} )(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z.]* (?:\d{4})|(?:\d{2} )(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z,]* (?:\d{4})|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{1,2})[a-z,]* (?:\d{4})|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{4})|\d{1,2}\/\d{4}|\d{4}|\d{1,2}-\d{1,2}-\d{2,4})\b')
from datetime import date
def change_format(a):
    """Pad a partial date (year only, or month/year) out to MM/DD/YYYY.

    Args:
        a: list of extracted date strings; only the first element is used.

    Returns:
        * ``None`` when ``a`` is empty (no date was extracted for the row);
        * ``'01/01/YYYY'`` for a lone year such as ``'1968'``;
        * ``'MM/01/YYYY'`` for a month/year pair such as ``'8/2009'``;
        * the first element unchanged when it has three or more parts.

    Raises:
        ValueError: if a one- or two-part string is not numeric or does not
            form a valid date.
    """
    # findall() yields [] for rows without a recognisable date; a[0] would
    # raise IndexError on those.
    if not a:
        return None
    parts = re.split('[-/ ]', a[0])
    # '%m/%d/%Y' (month first) is the requested output layout, e.g.
    # '8/2009' -> '08/01/2009'.
    if len(parts) == 1:
        return date(int(parts[0]), 1, 1).strftime('%m/%d/%Y')
    if len(parts) == 2:
        return date(int(parts[1]), int(parts[0]), 1).strftime('%m/%d/%Y')
    return a[0]
# Apply change_format to every list in the 'new' column. Series.apply
# evaluates eagerly; on Python 3 a bare map() is a lazy iterator and cannot
# be stored as a column of per-row results.
df['modified_new'] = df['new'].apply(change_format)
df['modified_new']
提取的日期时间输出(即 df['new'])如下所示(注意空单元格 [] 因为原始文本字符串包含不规则格式)
0 [03/25/93]
1 [6/18/85]
2 [7/8/71]
3 [9/27/75]
4 [2/6/96]
5 [7/06/79]
6 [5/18/78]
7 [10/24/89]
8 [3/7/86]
9 [4/10/71]
10 [5/11/85]
...
490 [2007]
491 [2009]
492 [1986]
493 []
494 [2002]
495 [1979]
496 [2006]
497 [2008]
498 [2005]
499 [1980]
import pandas as pd
import re
from datetime import datetime
def helper(a, f):
    """Parse ``a`` with strptime format ``f`` and return it as 'MM-DD-YYYY'.

    Raises:
        ValueError: if ``a`` does not match ``f``.
    """
    return datetime.strptime(a, f).strftime('%m-%d-%Y')


def change_format(a):
    """Normalise one extracted date string to 'MM-DD-YYYY'.

    Missing parts default to 1: a lone year becomes January 1st of that
    year and a month/year pair becomes the 1st of that month.

    Supported layouts:
        * textual month, in any of these orders/punctuations:
          '24 Jan 2001', 'Mar. 20, 2009', 'May 22nd, 2015', 'Jan-12-1993',
          'March 1993'.  Only the first three letters of the month word are
          used, so misspellings such as 'Janaury' or 'Decemeber' parse too.
        * numeric: 'M/D/YY', 'M-D-YY', 'M/D/YYYY', 'M-D-YYYY', 'M/YYYY',
          'YYYY'.

    Args:
        a: the date string to convert.

    Returns:
        The date as 'MM-DD-YYYY', or ``None`` for a three-part numeric date
        whose year is neither 2 nor 4 digits long.

    Raises:
        ValueError: if the string cannot be interpreted as a date.
    """
    if re.search(r'[A-Za-z]', a) is not None:
        # Textual month. Drop ordinal suffixes ("22nd" -> "22") and ignore
        # punctuation by pulling out just the word and the numbers.
        cleaned = re.sub(r'(?<=\d)(?:st|nd|rd|th)\b', '', a)
        names = re.findall(r'[A-Za-z]+', cleaned)
        numbers = re.findall(r'\d+', cleaned)
        if len(names) != 1 or len(numbers) not in (1, 2):
            raise ValueError('unrecognised date string: %r' % (a,))
        year = numbers[-1]
        if len(numbers) == 2:
            # The day precedes the year in both "DD Mon YYYY" and
            # "Mon DD YYYY".
            day = numbers[0]
            year_code = '%y' if len(year) == 2 else '%Y'
        else:
            # Month and year only: assume the first day of the month.
            day = '1'
            year_code = '%Y'
        # Abbreviating the month to three letters lets a single '%b'
        # directive cover short, full and misspelled month names.
        return helper(' '.join((day, names[0][:3], year)),
                      '%d %b ' + year_code)

    c = re.split('/|-| ', a)
    if len(c) == 3:
        month, day, year = c
        if len(year) == 2:
            # Let strptime expand the two-digit year (its '%y' rule),
            # whichever separator the input used.
            return helper('/'.join(c), '%m/%d/%y')
        if len(year) == 4:
            return datetime(int(year), int(month), int(day)).strftime('%m-%d-%Y')
        # Year of unexpected width: report "no date" explicitly.
        return None
    if len(c) == 2:
        # 'M/YYYY': assume the first day of the month.
        return datetime(int(c[1]), int(c[0]), 1).strftime('%m-%d-%Y')
    # Lone year: assume January 1st.
    return datetime(int(c[0]), 1, 1).strftime('%m-%d-%Y')
# Read the whole file; the ``with`` block closes it, so no explicit close()
# is needed.
with open('dates.txt') as f:
    d = f.read()
# Extract every date-like substring from the text.
k = re.findall(r'\b(\d{1,2}\/\d{1,2}\/\d{2,4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*-\d{2}-\d{2,4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2}. )\d{4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2}, )\d{4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2} )\d{4}|(?:\d{2} )(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{4})|(?:\d{2} )(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z.]* (?:\d{4})|(?:\d{2} )(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z,]* (?:\d{4})|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{1,2})[a-z,]* (?:\d{4})|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{4})|\d{1,2}\/\d{4}|\d{4}|\d{1,2}-\d{1,2}-\d{2,4})\b',d)
# '7787' is matched by the bare four-digit alternative but is excluded as a
# non-date (presumably an unrelated number in the data - TODO confirm).
# Filtering drops every occurrence and does not raise when it is absent.
k = [match for match in k if match != '7787']
# map() is a lazy iterator on Python 3 and has no .remove(); build a real
# list and discard every None that change_format returned.
dates = [converted for converted in map(change_format, k) if converted is not None]
df = pd.DataFrame(dates, columns=['date'])
# change_format emits 'MM-DD-YYYY'; state the format so no inference is needed.
df['date'] = pd.to_datetime(df.date, format='%m-%d-%Y')
df = df.sort_values('date').reset_index(drop=True)
我使用正则表达式的 .findall() 函数从 pandas 数据框中的字符串里提取不同格式的日期时间元素,得到 'list' 对象,并将它们放入名为 "new" 的新列中。但是,有些日期时间列表对象只有 4 位 YYYY 格式,缺少月 (MM) 和日 (DD)(例如 df['new'].iloc[99]),还有一些对象只缺少日 (DD)(例如 df['new'].iloc[221]),如下所示:
new
0 [6/12/2009]
1 [12-10-2013]
2 [7/8/71]
3 [9-27-75]
4 [23rd May, 96]
5 [7/06/79]
...
99 [1968]
...
221 [8/2009]
...
470 [May 22nd, 2015]
注意:每个单元格都是一个列表对象。
因为我想在所有这些日期提取和格式化完成后按时间顺序对它们进行排序,所以为了方便起见,对于像 [1968] 这样的单元格值,我假设它是该年的第一天(即 1968 年 1 月 1 日);对于像 [8/2009](或 [08/2009])这样的单元格值,我假设它是该年该月的第一天(即 2009 年 8 月 1 日)。
所以我问有没有办法写一个简单的函数来将[YYYY]和[M/YYYY](或[MM/YYYY])格式全部转换成[MM/DD/YYYY]格式,如
[1968] to [01/01/1968]
[8/2009](or [08/2009]) to [08/01/2009]
对列 df['new'] 中缺少月份和日期信息的(可能)数百个列表对象 [] 执行此转换的最简单方法是什么?
[编辑] 我使用了以下代码(使用 Bandi A 提供的 change_format() 函数)
import pandas as pd
import re
import datetime as dt
# Load the text file (no header); every line, including its trailing
# newline, becomes one record.
with open('dates.txt') as file:
    doc = file.readlines()
# Hold the raw text in a DataFrame column: assigning df['new'] on a bare
# Series would set a single *label* called 'new' instead of adding a column.
df = pd.DataFrame({'text': doc})
# Use regex findall() to extract every date-like substring of each line;
# each cell of 'new' is a (possibly empty) list of matched strings.
df['new'] = df['text'].str.findall(r'\b(\d{1,2}\/\d{1,2}\/\d{2,4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*-\d{2}-\d{2,4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2}. )\d{4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2}, )\d{4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2} )\d{4}|(?:\d{2} )(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{4})|(?:\d{2} )(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z.]* (?:\d{4})|(?:\d{2} )(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z,]* (?:\d{4})|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{1,2})[a-z,]* (?:\d{4})|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{4})|\d{1,2}\/\d{4}|\d{4}|\d{1,2}-\d{1,2}-\d{2,4})\b')
from datetime import date
def change_format(a):
    """Pad a partial date (year only, or month/year) out to MM/DD/YYYY.

    Args:
        a: list of extracted date strings; only the first element is used.

    Returns:
        * ``None`` when ``a`` is empty (no date was extracted for the row);
        * ``'01/01/YYYY'`` for a lone year such as ``'1968'``;
        * ``'MM/01/YYYY'`` for a month/year pair such as ``'8/2009'``;
        * the first element unchanged when it has three or more parts.

    Raises:
        ValueError: if a one- or two-part string is not numeric or does not
            form a valid date.
    """
    # findall() yields [] for rows without a recognisable date; a[0] would
    # raise IndexError on those.
    if not a:
        return None
    parts = re.split('[-/ ]', a[0])
    # '%m/%d/%Y' (month first) is the requested output layout, e.g.
    # '8/2009' -> '08/01/2009'.
    if len(parts) == 1:
        return date(int(parts[0]), 1, 1).strftime('%m/%d/%Y')
    if len(parts) == 2:
        return date(int(parts[1]), int(parts[0]), 1).strftime('%m/%d/%Y')
    return a[0]
# Apply change_format to every list in the 'new' column. Series.apply
# evaluates eagerly; on Python 3 a bare map() is a lazy iterator and cannot
# be stored as a column of per-row results.
df['modified_new'] = df['new'].apply(change_format)
df['modified_new']
提取的日期时间输出(即 df['new'])如下所示(注意空单元格 [] 因为原始文本字符串包含不规则格式)
0 [03/25/93]
1 [6/18/85]
2 [7/8/71]
3 [9/27/75]
4 [2/6/96]
5 [7/06/79]
6 [5/18/78]
7 [10/24/89]
8 [3/7/86]
9 [4/10/71]
10 [5/11/85]
...
490 [2007]
491 [2009]
492 [1986]
493 []
494 [2002]
495 [1979]
496 [2006]
497 [2008]
498 [2005]
499 [1980]
import pandas as pd
import re
from datetime import datetime
def helper(a, f):
    """Parse ``a`` with strptime format ``f`` and return it as 'MM-DD-YYYY'.

    Raises:
        ValueError: if ``a`` does not match ``f``.
    """
    return datetime.strptime(a, f).strftime('%m-%d-%Y')


def change_format(a):
    """Normalise one extracted date string to 'MM-DD-YYYY'.

    Missing parts default to 1: a lone year becomes January 1st of that
    year and a month/year pair becomes the 1st of that month.

    Supported layouts:
        * textual month, in any of these orders/punctuations:
          '24 Jan 2001', 'Mar. 20, 2009', 'May 22nd, 2015', 'Jan-12-1993',
          'March 1993'.  Only the first three letters of the month word are
          used, so misspellings such as 'Janaury' or 'Decemeber' parse too.
        * numeric: 'M/D/YY', 'M-D-YY', 'M/D/YYYY', 'M-D-YYYY', 'M/YYYY',
          'YYYY'.

    Args:
        a: the date string to convert.

    Returns:
        The date as 'MM-DD-YYYY', or ``None`` for a three-part numeric date
        whose year is neither 2 nor 4 digits long.

    Raises:
        ValueError: if the string cannot be interpreted as a date.
    """
    if re.search(r'[A-Za-z]', a) is not None:
        # Textual month. Drop ordinal suffixes ("22nd" -> "22") and ignore
        # punctuation by pulling out just the word and the numbers.
        cleaned = re.sub(r'(?<=\d)(?:st|nd|rd|th)\b', '', a)
        names = re.findall(r'[A-Za-z]+', cleaned)
        numbers = re.findall(r'\d+', cleaned)
        if len(names) != 1 or len(numbers) not in (1, 2):
            raise ValueError('unrecognised date string: %r' % (a,))
        year = numbers[-1]
        if len(numbers) == 2:
            # The day precedes the year in both "DD Mon YYYY" and
            # "Mon DD YYYY".
            day = numbers[0]
            year_code = '%y' if len(year) == 2 else '%Y'
        else:
            # Month and year only: assume the first day of the month.
            day = '1'
            year_code = '%Y'
        # Abbreviating the month to three letters lets a single '%b'
        # directive cover short, full and misspelled month names.
        return helper(' '.join((day, names[0][:3], year)),
                      '%d %b ' + year_code)

    c = re.split('/|-| ', a)
    if len(c) == 3:
        month, day, year = c
        if len(year) == 2:
            # Let strptime expand the two-digit year (its '%y' rule),
            # whichever separator the input used.
            return helper('/'.join(c), '%m/%d/%y')
        if len(year) == 4:
            return datetime(int(year), int(month), int(day)).strftime('%m-%d-%Y')
        # Year of unexpected width: report "no date" explicitly.
        return None
    if len(c) == 2:
        # 'M/YYYY': assume the first day of the month.
        return datetime(int(c[1]), int(c[0]), 1).strftime('%m-%d-%Y')
    # Lone year: assume January 1st.
    return datetime(int(c[0]), 1, 1).strftime('%m-%d-%Y')
# Read the whole file; the ``with`` block closes it, so no explicit close()
# is needed.
with open('dates.txt') as f:
    d = f.read()
# Extract every date-like substring from the text.
k = re.findall(r'\b(\d{1,2}\/\d{1,2}\/\d{2,4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*-\d{2}-\d{2,4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2}. )\d{4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2}, )\d{4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2} )\d{4}|(?:\d{2} )(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{4})|(?:\d{2} )(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z.]* (?:\d{4})|(?:\d{2} )(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z,]* (?:\d{4})|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{1,2})[a-z,]* (?:\d{4})|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{4})|\d{1,2}\/\d{4}|\d{4}|\d{1,2}-\d{1,2}-\d{2,4})\b',d)
# '7787' is matched by the bare four-digit alternative but is excluded as a
# non-date (presumably an unrelated number in the data - TODO confirm).
# Filtering drops every occurrence and does not raise when it is absent.
k = [match for match in k if match != '7787']
# map() is a lazy iterator on Python 3 and has no .remove(); build a real
# list and discard every None that change_format returned.
dates = [converted for converted in map(change_format, k) if converted is not None]
df = pd.DataFrame(dates, columns=['date'])
# change_format emits 'MM-DD-YYYY'; state the format so no inference is needed.
df['date'] = pd.to_datetime(df.date, format='%m-%d-%Y')
df = df.sort_values('date').reset_index(drop=True)