Python:在垃圾邮件分类中从邮件文本删除标点符号

python remove punctuation email spam

正在尝试从单词列表中删除标点符号。 python 编程新手,所以如果有人能提供帮助那就太好了。这样做的目的是用于电子邮件垃圾邮件分类。以前我在检查标点符号是否存在后加入了单词,但这给了我单个字符而不是整个单词。在更改它以获取单词后,这就是我下面的内容,所以现在尝试删除标点符号,因为它不会像我以前那样工作。

import os
import string
from collections import Counter
from os import listdir  # return all files and folders in the directory

import nltk
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# used for importing the lingspam dataset
def importLingspamDataset(dir):
    """Read every file in *dir* and return two parallel lists.

    Returns:
        (allEmails, fileNames): the raw text of each email and the
        matching file name, in os.listdir() order.
    """
    allEmails = []  # raw text of each email, once read
    fileNames = []  # file name matching each entry in allEmails
    for file in listdir(dir):
        # 'with' guarantees the handle is closed even if read() raises;
        # os.path.join is portable, unlike manual '/' concatenation.
        with open(os.path.join(dir, file), "r") as f:
            fileNames.append(file)
            allEmails.append(f.read())
    return allEmails, fileNames

def importEnronDataset(dir):
    """Read every file in *dir* and return two parallel lists.

    Returns:
        (allEmails, fileNames): the raw text of each email and the
        matching file name, in os.listdir() order.

    Bug fix: the original had ``return`` indented inside the for loop,
    so it returned after reading only the FIRST file in the directory.
    """
    allEmails = []  # for storing the emails once read
    fileNames = []
    for file in listdir(dir):
        # context manager closes the file even if read() raises
        with open(os.path.join(dir, file), "r") as f:
            fileNames.append(file)
            allEmails.append(f.read())
    # return AFTER the loop so every file in the directory is included
    return allEmails, fileNames

# used to remove punctuation from the emails as this is of no use for detecting spam
def removePunctuation(cleanedEmails):
    """Split each email into words and strip punctuation from every word.

    Args:
        cleanedEmails: a list of email body strings.
    Returns:
        A flat list of punctuation-free words from ALL emails.

    Bug fixes vs. the original:
    - the original returned inside the loop, so only the first email was
      processed and the accumulator was re-created on every iteration;
    - the character filter iterated over the list of whole words instead
      of the characters of each word, yielding characters, not words;
    - the precomputed ``punc`` set was built but never used.
    """
    punc = set(string.punctuation)  # set gives O(1) membership tests
    allWords = []
    for line in cleanedEmails:
        for word in line.split():
            cleaned = ''.join(c for c in word if c not in punc)
            if cleaned:  # drop tokens that were pure punctuation
                allWords.append(cleaned)
    return allWords

# used to remove stopwords i.e. words of no use in detecting spam
def removeStopwords(cleanedEmails):
    """Return the words from *cleanedEmails* that are not English stopwords.

    Args:
        cleanedEmails: a list of words (output of removePunctuation).
    Returns:
        A new list with the NLTK English stopwords filtered out.

    Bug fix: the original iterated over the stopword set itself with the
    condition ``stopw not in removeWords`` — which can never be true —
    so the input list was returned unchanged.
    """
    removeWords = set(stopwords.words('english'))  # sets all the stopwords to be removed
    # keep only the words that are NOT stopwords (case-insensitive,
    # since the NLTK list is lowercase)
    return [word for word in cleanedEmails if word.lower() not in removeWords]

# function to return each word to its root form - allows simplicity
def lemmatizeEmails(cleanedEmails):
    """Lemmatize every word in *cleanedEmails* and return the new list."""
    lemmatizer = WordNetLemmatizer()  # maps words to their dictionary root form
    lemmatized = []
    for token in cleanedEmails:
        lemmatized.append(lemmatizer.lemmatize(token))
    return lemmatized

# function to allow a systematic process of eliminating the undesired elements within the emails
def cleanAllEmails(cleanedEmails):
    """Run the full cleaning pipeline: punctuation -> stopwords -> lemmatization."""
    return lemmatizeEmails(removeStopwords(removePunctuation(cleanedEmails)))

def createDictionary(email):
    """Build a word-frequency dictionary from *email* and render a word cloud.

    Args:
        email: a list of (already cleaned) words.
    Returns:
        The full Counter of word frequencies.

    Bug fixes vs. the original:
    - ``dictionary.most_common(3000)`` returned a value that was silently
      discarded; its result is now actually used to cap the word cloud at
      the 3000 most frequent words;
    - the function now returns the dictionary, so the module-level
      assignment ``trainDictionary = createDictionary(...)`` no longer
      receives None.
    """
    dictionary = Counter(email)
    # keep only the 3000 most common words for the visualisation
    topWords = dict(dictionary.most_common(3000))
    word_cloud = WordCloud(width=400, height=400, background_color='white',
              min_font_size=12).generate_from_frequencies(topWords)
    plt.imshow(word_cloud)
    plt.axis("off")
    plt.margins(x=0, y=0)
    plt.show()
    word_cloud.to_file('test1.png')
    return dictionary

def featureExtraction(email):
    """Allocate and return a (num_emails, 3000) feature matrix.

    Args:
        email: an iterable of emails (one row per email).
    Returns:
        A zero-initialised numpy array of shape (len(email), 3000).

    Bug fix: the original built the matrix but never returned it, so
    callers always received None. The per-word counts still need to be
    filled in.  # TODO: populate the matrix from the dictionary
    """
    emailFiles = list(email)  # materialise so len() works on any iterable
    featureMatrix = np.zeros((len(emailFiles), 3000))
    return featureMatrix


def classifyLingspamDataset(email):
    """Label each file name: True when it is spam (name contains "spmsg")."""
    return ["spmsg" in fileName for fileName in email]

# --- Script entry: load, clean, and visualise the Lingspam training set ---
# Lingspam dataset
trainingDataLingspam, trainingLingspamFilename = importLingspamDataset("spam-non-spam-dataset/train-mails") # extract the training emails from the dataset
#testingDataLingspam, testingLingspamFilename = importLingspamDataset("spam-non-spam-dataset/test-mails") # extract the testing emails from the dataset

# Run the punctuation / stopword / lemmatization pipeline over the raw emails.
trainingDataLingspamClean = cleanAllEmails(trainingDataLingspam)
#testingDataLingspamClean = cleanAllEmails(testingDataLingspam)

#trainClassifyLingspam = classifyLingspamDataset(trainingDataLingspam)
#testClassifyLingspam = classifyLingspamDataset(testingDataLingspam)

# NOTE(review): createDictionary has no return statement here, so
# trainDictionary is None — confirm whether a return value was intended.
trainDictionary = createDictionary(trainingDataLingspamClean)
#createDictionary(testingDataLingspamClean)

#trainingDataEnron, trainingEnronFilename = importEnronDataset("spam-non-spam-dataset-enron/bigEmailDump/training/")

根据您的问题,我假设您有一个电子邮件列表,您希望为每封电子邮件删除标点符号。此答案基于您发布的代码的第一次修订。

import string


def removePunctuation(emails):
    """Strip all punctuation characters from every email in *emails*.

    str.maketrans with three arguments builds a translation table whose
    third argument lists characters to DELETE; str.translate then removes
    them in a single C-level pass over each string.
    """
    # build the table once, outside the loop — it never changes
    table = str.maketrans('', '', string.punctuation)

    cleaned_emails = []
    for email in emails:
        cleaned_emails.append(email.translate(table))

    return cleaned_emails


if __name__ == '__main__':

    # Demonstration: clean two sample emails and print the result.
    # (The variable names substitute cleanedEmails from the question
    # with plain emails-in / cleaned-emails-out.)
    sample_emails = ["This is a, test!", "This is another#@! \ntest"]
    result = removePunctuation(sample_emails)
    print(result)
input: ["This is a, test!", "This is another#@! \ntest"]
output: ['This is a test', 'This is another \ntest']

编辑:

问题在与 OP 交谈后得到解决。 OP 在使用 WordCloud 时遇到问题,我提供的解决方案正在运行。设法通过让 WordCloud 工作来指导 OP。 OP 现在正在微调 WordCloud 的结果。