Convert a pandas data frame column whose values are vectors into tensors
My question is how to convert vectors stored in a pandas data frame into tensors.
The data frame has a Resume column that contains a vector representation of each resume document. I need to convert this column of the dataset into a tensor.
The code is below.
The Resume column holds a list of numbers (a vector) per row, and the Category column of the data frame holds scalar values.
I tried converting it into a tensor this way:
tf.convert_to_tensor(output[["Resume"]])
Another approach I tried is
numeric_dict_ds = tf.data.Dataset.from_tensor_slices((dict(output[["Resume"]]), output[["Category"]]))
And the last approach was
numeric_dataset = tf.data.Dataset.from_tensor_slices((numeric_features, target))
but none of them work.
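For reference, a minimal reproduction of the structure I mean (made-up numbers, column names as in my code below):
import pandas as pd
import tensorflow as tf

df = pd.DataFrame({
    "Resume": [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],  # one vector per row
    "Category": [0, 1],                            # one scalar label per row
})

# The Resume column has dtype=object, so this raises something like
# "ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list)."
tf.convert_to_tensor(df[["Resume"]])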
import os
import shutil
import re
import string
import warnings

import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow import keras
from official.nlp import optimization  # to create AdamW optimizer
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

tf.get_logger().setLevel('ERROR')
warnings.filterwarnings('ignore')
nltk.download('stopwords')
stopwords_set = set(stopwords.words('english') + ['``', "''"])
def clean_resume_text(resume):
    resume = resume.lower()
    resume = re.sub(r'http\S+\s*', ' ', resume)   # remove URLs
    resume = ''.join([w for w in resume if not w.isdigit()])  # remove digits
    resume = re.sub(r'RT|cc', ' ', resume)        # remove RT and cc
    resume = re.sub(r'#\S+', '', resume)          # remove hashtags
    resume = re.sub(r'@\S+', ' ', resume)         # remove mentions
    resume = ''.join([w for w in resume if w not in string.punctuation])  # remove punctuation
    resume = re.sub(r'\W', ' ', resume)
    #resume = ''.join([w for w in resume if w not in stopwords_set])
    resume = re.sub(r'[^\x00-\x7f]', r' ', resume)  # drop non-ASCII characters
    resume = re.sub(r'\s+', ' ', resume)          # collapse extra spaces
    return resume
resume_df['Resume'] = resume_df.Resume.apply(clean_resume_text)

bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4", trainable=True)
def get_sentence_embedding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']
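# NOTE: pooled_output has shape (1, 768) for a single sentence, so each
# DataFrame cell below ends up holding a 2-D tensor rather than a flat vector.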
resume_df["Resume"]=resume_df.Resume.apply(lambda x: get_sentence_embeding([x]))
# Save the vectorized dataframe
resume_df.to_pickle("resume_Embedding.pkl")
output = pd.read_pickle("resume_Embedding.pkl")
encoder = LabelEncoder()
output["Category"] = encoder.fit_transform(output["Category"])
output = tf.convert_to_tensor(output[["Resume", "Category"]])  # this is the call that fails
model = tf.keras.Sequential([
    keras.Input(shape=output.shape),
    keras.layers.Dense(output.shape[0], activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(25, activation='softmax')
])
model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
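# NOTE: 'mse' with a 25-way softmax output is an unusual pairing; with the integer
# labels produced by LabelEncoder, 'sparse_categorical_crossentropy' would be the
# usual choice.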
import tensorflow as tf
resume = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
tf.convert_to_tensor(resume, dtype=tf.float32)
Output:
<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[1., 2.],
[3., 4.],
[5., 6.]], dtype=float32)>
Take a look at this link.
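Applied to the data frame in the question, the trick is to turn the object column into a regular rectangular array first, e.g. with numpy.stack (a sketch, assuming every Resume cell holds an equal-length vector):
import numpy as np
import tensorflow as tf

# Stack the per-row vectors into one (n_rows, dim) float array.
# If each cell was stored as a (1, 768) tensor, squeeze out axis 1 afterwards.
features = np.stack(output["Resume"].to_numpy()).astype("float32")
labels = output["Category"].to_numpy()

features_tensor = tf.convert_to_tensor(features)
dataset = tf.data.Dataset.from_tensor_slices((features, labels))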