LogisticsRegression-checkpoint.ipynb 35.8 KB
In [1]:
import matplotlib
matplotlib.rcParams['font.sans-serif']=[u'simHei']
matplotlib.rcParams['axes.unicode_minus']=False
In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score
In [22]:
df = pd.read_csv('./SMSSpamCollection.csv',header=None)
print(df.head)

print("垃圾邮件个数:%s" % df[df[0]=='spam'][0].count())
print("正常邮件个数:%s" % df[df[0]=='ham'][0].count())
Out [22]:
<bound method NDFrame.head of          0                                                  1
0      ham  Go until jurong point crazy.. Available only i...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf he lives arou...
5     spam  FreeMsg Hey there darling it's been 3 week's n...
6      ham  Even my brother is not like to speak with me. ...
7      ham  As per your request 'Melle Melle (Oru Minnamin...
8     spam  WINNER!! As a valued network customer you have...
9     spam  Had your mobile 11 months or more? U R entitle...
10     ham  I'm gonna be home soon and i don't want to tal...
11    spam  SIX chances to win CASH! From 100 to 20000 pou...
12    spam  URGENT! You have won a 1 week FREE membership ...
13     ham  I've been searching for the right words to tha...
14     ham                I HAVE A DATE ON SUNDAY WITH WILL!!
15    spam  XXXMobileMovieClub: To use your credit click t...
16     ham                         Oh k...i'm watching here:)
17     ham  Eh u remember how 2 spell his name... Yes i di...
18     ham  Fine if that?s the way u feel. That?s the way ...
19    spam  England v Macedonia - dont miss the goals/team...
20     ham          Is that seriously how you spell his name?
21     ham    I‘m going to try for 2 months ha ha only joking
22     ham  So ü pay first lar... Then when is da stock co...
23     ham  Aft i finish my lunch then i go str down lor. ...
24     ham  Ffffffffff. Alright no way I can meet up with ...
25     ham  Just forced myself to eat a slice. I'm really ...
26     ham                     Lol your always so convincing.
27     ham  Did you catch the bus ? Are you frying an egg ...
28     ham  I'm back &amp; we're packing the car now I'll ...
29     ham  Ahhh. Work. I vaguely remember that! What does...
...    ...                                                ...
5542   ham           Armand says get your ass over to epsilon
5543   ham             U still havent got urself a jacket ah?
5544   ham  I'm taking derek &amp; taylor to walmart if I'...
5545   ham      Hi its in durban are you still on this number
5546   ham         Ic. There are a lotta childporn cars then.
5547  spam  Had your contract mobile 11 Mnths? Latest Moto...
5548   ham                  No I was trying it all weekend ;V
5549   ham  You know wot people wear. T shirts jumpers hat...
5550   ham         Cool what time you think you can get here?
5551   ham  Wen did you get so spiritual and deep. That's ...
5552   ham  Have a safe trip to Nigeria. Wish you happines...
5553   ham                        Hahaha..use your brain dear
5554   ham  Well keep in mind I've only got enough gas for...
5555   ham  Yeh. Indians was nice. Tho it did kane me off ...
5556   ham  Yes i have. So that's why u texted. Pshew...mi...
5557   ham  No. I meant the calculation is the same. That ...
5558   ham                              Sorry I'll call later
5559   ham  if you aren't here in the next  &lt;#&gt;  hou...
5560   ham                  Anything lor. Juz both of us lor.
5561   ham  Get me out of this dump heap. My mom decided t...
5562   ham  Ok lor... Sony ericsson salesman... I ask shuh...
5563   ham                                Ard 6 like dat lor.
5564   ham  Why don't you wait 'til at least wednesday to ...
5565   ham                                       Huh y lei...
5566  spam  REMINDER FROM O2: To get 2.50 pounds free call...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity * was in mood for that. So...any other su...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]>
垃圾邮件个数:747
正常邮件个数:4825
In [23]:
# In[1]
X = df[1].values.astype('U')
y = df[0].values.astype('U')
X_train_raw,X_test_raw,y_train,y_test=train_test_split(X,y)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)
Out [23]:
['Its normally hot mail. Com you see!'
 "Love isn't a decision it's a feeling. If we could decide who to love then life would be much simpler but then less magical"
 'when you and derek done with class?' ...
 'Indians r poor but India is not a poor country. Says one of the swiss bank directors. He says that " &lt;#&gt;  lac crore" of Indian money is deposited in swiss banks which can be used for \'taxless\' budget for  &lt;#&gt;  yrs. Can give  &lt;#&gt;  crore jobs to all Indians. From any village to Delhi 4 lane roads. Forever free power suply to more than  &lt;#&gt;  social projects. Every citizen can get monthly  &lt;#&gt; /- for  &lt;#&gt;  yrs. No need of World Bank &amp; IMF loan. Think how our money is blocked by rich politicians. We have full rights against corrupt politicians. Itna forward karo ki pura INDIA padhe.g.m."'
 "Annoying isn't it."
 'U meet other fren dun wan meet me ah... Muz b a guy rite...']
In [24]:
LR = LogisticRegression()
LR.fit(X_train,y_train)
predictions = LR.predict(X_test)
for i,prediction in enumerate(predictions[:5]):
    print("预测为 %s ,信件为 %s" % (prediction,X_test_raw[i]))
Out [24]:
预测为 ham ,信件为 Wat time ü wan today?
预测为 ham ,信件为 Hi.:)technical support.providing assistance to us customer through call and email:)
预测为 ham ,信件为 Are there TA jobs available? Let me know please cos i really need to start working
预测为 ham ,信件为 Heehee that was so funny tho
预测为 ham ,信件为 Guess who spent all last night phasing in and out of the fourth dimension
In [25]:
# In[2]二元分类分类指标
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
# predictions 与 y_test
confusion_matrix = confusion_matrix(y_test,predictions)
print(confusion_matrix)
plt.matshow(confusion_matrix)
plt.title("混淆矩阵")
plt.colorbar()
plt.ylabel("真实值")
plt.xlabel("预测值")
plt.show()
Out [25]:
[[1205    0]
 [  37  151]]
In [26]:
# In[3] 给出 precision    recall  f1-score   support
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

from sklearn.metrics import roc_curve,auc
# 准确率
scores =  cross_val_score(LR,X_train,y_train,cv=5)
print("准确率为: ",scores)
print("平均准确率为: ",np.mean(scores))

# 必须要将标签转为数值
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
y_train_n = class_le.fit_transform(y_train)
y_test_n = class_le.fit_transform(y_test)
Out [26]:
             precision    recall  f1-score   support

        ham       0.97      1.00      0.98      1205
       spam       1.00      0.80      0.89       188

avg / total       0.97      0.97      0.97      1393

准确率为:  [0.94856459 0.94976077 0.95454545 0.96052632 0.95209581]
平均准确率为:  0.9530985875139673
In [27]:
# 精准率
precision =  cross_val_score(LR,X_train,y_train_n,cv=5,scoring='precision')
print("平均精准率为: ",np.mean(precision))
# 召回率
recall =  cross_val_score(LR,X_train,y_train_n,cv=5,scoring='recall')
print("平均召回率为: ",np.mean(recall))   
# F1值
f1 =  cross_val_score(LR,X_train,y_train_n,cv=5,scoring='f1')
print("平均F1值为: ",np.mean(f1))  
Out [27]:
平均精准率为:  0.989738656405323
平均召回率为:  0.656547619047619
平均F1值为:  0.7887220439566227
In [28]:
# In[4] ROC曲线 y_test_n为数值
predictions_pro = LR.predict_proba(X_test)
false_positive_rate, recall, thresholds = roc_curve(y_test_n,predictions_pro[:,1])
roc_auc = auc(false_positive_rate, recall)
plt.title("受试者操作特征曲线(ROC)")
plt.plot(false_positive_rate, recall, 'b', label='AUC = % 0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('假阳性率')
plt.ylabel('召回率')
plt.show()