Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
H
homework
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ligang
homework
Commits
89211377
Commit
89211377
authored
Mar 07, 2019
by
ligang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
'z抓取京东信息'
parent
e91d8789
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
179 additions
and
0 deletions
+179
-0
18-homework-ligang/crawl_jd.py
+179
-0
No files found.
18-homework-ligang/crawl_jd.py
0 → 100644
View file @
89211377
#!/usr/bin/env python3
#-*-coding:utf-8 -*-
from
selenium
import
webdriver
import
time
import
os
import
json
# 鼠标事件依赖于ActionChains对象
from
selenium.webdriver.common.action_chains
import
ActionChains
from
selenium.webdriver.common.keys
import
Keys
# Location of the ChromeDriver executable. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences
# (backslash followed by "P", "G", ...), which Python flags with a warning.
path = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
# Page that every navigation starts from.
target = "https://www.jd.com"
# Order-list page used to verify that the login is valid.
test_order = "https://order.jd.com/center/list.action"
# User name and password used for the login.
# NOTE(review): credentials are hard-coded in source control; consider
# loading them from the environment or from an untracked config file.
uname = "16601258428"
pwd = "123456qwert"
# Browser session shared by the functions in this module.
driver = webdriver.Chrome(path)
def get_element_dict(info_element):
    """Convert one parameter-group element into a nested dict.

    The group title is read from the element's ``h3`` tag, the parameter
    names from its ``dt`` tags and the parameter values from the ``dd`` tags
    that do not carry the ``Ptable-tips`` class.

    Args:
        info_element: Element exposing ``find_element_by_tag_name``,
            ``find_elements_by_tag_name`` and ``find_elements_by_xpath``.

    Returns:
        A dict shaped like ``{group_title: {name: value, ...}}``.
    """
    # Title of this parameter group.
    computer_part = info_element.find_element_by_tag_name("h3")
    # Parameter names.
    computer_info_keys = info_element.find_elements_by_tag_name("dt")
    # Parameter values, skipping the tooltip <dd> elements.
    computer_info_values = info_element.find_elements_by_xpath(
        "dl//dd[not(contains(@class,'Ptable-tips'))]"
    )
    # zip pairs names and values by position and stops at the shorter list,
    # so a name without a matching value is dropped instead of raising
    # IndexError.
    key_and_value_dict = {
        key.text: value.text
        for key, value in zip(computer_info_keys, computer_info_values)
    }
    return {computer_part.text: key_and_value_dict}
def save_goods_info(info_list):
    """Append the scraped product information to ``computer.infos``.

    The file lives in a ``goods_info`` directory next to the current working
    directory (i.e. inside the parent of ``os.getcwd()``). The directory is
    created when missing.

    Args:
        info_list: Object whose ``str()`` representation is appended to the
            file.
    """
    goods_dir = os.path.join(os.path.dirname(os.getcwd()), "goods_info")
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(goods_dir, exist_ok=True)
    goods_file = os.path.join(goods_dir, "computer.infos")
    with open(goods_file, "a", encoding="utf-8") as f:
        f.write(str(info_list))
def _switch_to_new_window(known_handles):
    """Switch the driver to each window whose handle is not in known_handles."""
    for handle in driver.window_handles:
        if handle not in known_handles:
            driver.switch_to.window(handle)


def get_notebook_info():
    """Navigate from the home page to one laptop's detail page and save its specs.

    The function drives the shared ``driver`` through the category menu,
    applies the brand and price filters, sorts the list, opens the first
    product and then stores every ``Ptable-item`` group of that page via
    ``save_goods_info``. The fixed ``time.sleep`` pauses give the pages time
    to load between steps.
    """
    driver.get(target)
    elem = driver.find_element_by_link_text("电脑")
    # Hover over the menu entry so that the sub-menu is shown.
    ActionChains(driver).move_to_element(elem).perform()
    time.sleep(2)
    # Open the laptop category.
    driver.find_element_by_link_text("笔记本").click()
    time.sleep(2)

    # The category opens in another window: move the driver over to it.
    index_handle = driver.current_window_handle
    _switch_to_new_window((index_handle,))

    # Click the ThinkPad brand filter.
    driver.find_element_by_xpath(".//*[ @id =\"brand-11518\"]/a/img").click()
    time.sleep(3)
    # Click the "above 7000" price range.
    driver.find_element_by_xpath(
        "//*[@id=\"J_selectorPrice\"]/div/div[2]/div/ul/li[7]/a"
    ).click()
    time.sleep(2)
    # Sort by number of comments.
    driver.find_element_by_xpath(
        "//*[@id=\"J_filter\"]/div[1]/div[1]/a[3]"
    ).click()
    time.sleep(2)
    # Open the first laptop in the list (the one with the most comments).
    driver.find_element_by_xpath(
        "//*[@id =\"plist\"]/ul/li[1]/div/div[1]/a/img"
    ).click()
    time.sleep(3)

    # A third window exists now; skip both windows that are already known.
    notebook_handle = driver.current_window_handle
    _switch_to_new_window((notebook_handle, index_handle))

    # Scroll down before clicking the specification tab.
    js = "window.scrollTo(0,1500)"
    driver.execute_script(js)
    # Click the specification / parameters tab.
    driver.find_element_by_xpath("//*[@id=\"detail\"]/div[1]/ul/li[2]").click()

    info_elements = driver.find_elements_by_class_name("Ptable-item")
    # Parse every parameter group into its own dict.
    result_list = [
        get_element_dict(info_element) for info_element in info_elements
    ]
    # Persist the collected information to the output file.
    save_goods_info(result_list)
def _read_first_line(file_name):
    """Return the first line of file_name, closing the file afterwards."""
    with open(file_name, "r") as cookies_fh:
        return cookies_fh.readline()


def save_cookies_to_driver():
    """Load the cookies stored on disk into the shared driver.

    When the cookie file is missing or empty, ``login()`` is called first and
    the file is read again afterwards, so that the freshly written cookies
    are the ones that get parsed.

    Returns:
        The module-level ``driver`` with the stored cookies added.
    """
    cookies_file = get_cookies_file()
    jd_cookies_str = ""
    if os.path.exists(cookies_file):
        jd_cookies_str = _read_first_line(cookies_file)
    if not jd_cookies_str:
        # Nothing usable on disk: log in, then re-read what login stored.
        login()
        jd_cookies_str = _read_first_line(cookies_file)
    jd_cookies_dict = json.loads(jd_cookies_str)
    # The old cookies must be cleared before the stored ones are added.
    driver.get(target)
    driver.delete_all_cookies()
    for cookie in jd_cookies_dict:
        driver.add_cookie(cookie)
    return driver
def check_cookies():
    """Report whether the stored cookies give a logged-in session.

    The cookies are loaded into the driver, the order-list page is requested
    and the URL the browser ends up on is compared with the requested one.

    Returns:
        True when the browser stays on ``test_order``, False otherwise.
    """
    # Load the stored cookies into the browser session.
    browser = save_cookies_to_driver()
    # Request the order page and see where the browser ends up.
    browser.get(test_order)
    return browser.current_url == test_order
def get_cookies_dir():
    """Return the directory that stores the cookie file, creating it if needed.

    The directory is named ``cookies`` and sits inside the parent of the
    current working directory.

    Returns:
        The directory path as a string ending in ``/``; callers append the
        file name to it directly.
    """
    project_path = os.path.dirname(os.getcwd())
    file_path = project_path + '/cookies/'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(file_path, exist_ok=True)
    return file_path
def get_cookies_file():
    """Return the path of the cookie file inside the cookies directory."""
    cookies_dir = get_cookies_dir()
    return cookies_dir + "jd.cookies"
def save_cookies_to_file(driver):
    """Write the driver's current cookies to the cookie file as JSON.

    Args:
        driver: Object exposing ``get_cookies()``; its result is serialised
            with ``json.dump``.
    """
    # Use the shared helper so the cookie file name is defined in one place.
    cookies_path = get_cookies_file()
    cookies = driver.get_cookies()
    with open(cookies_path, "w") as c:
        # json.dump keeps the file readable by json.loads later on.
        json.dump(cookies, c)
def login():
    """Log in through the account login form and store the session cookies.

    Uses the module-level ``driver``, ``target``, ``uname`` and ``pwd``; the
    cookies are written with ``save_cookies_to_file`` once the form has been
    submitted.
    """
    driver.get(target)
    driver.maximize_window()
    time.sleep(3)

    # Open the login page and pick the account (user name / password) tab.
    login_link = driver.find_element_by_class_name("link-login")
    login_link.click()
    account_tab = driver.find_element_by_link_text("账户登录")
    account_tab.click()

    # Empty the user-name field before typing into it.
    name_field = driver.find_element_by_id("loginname")
    name_field.clear()
    name_field.send_keys(uname)

    password_field = driver.find_element_by_id("nloginpwd")
    password_field.send_keys(pwd)

    submit_button = driver.find_element_by_id("loginsubmit")
    submit_button.click()
    time.sleep(5)

    save_cookies_to_file(driver)
if __name__ == "__main__":
    try:
        # Keep logging in until the stored cookies are accepted.
        while not check_cookies():
            login()
        time.sleep(2)
        get_notebook_info()
    finally:
        # Leave the browser open briefly, then always shut it down.
        time.sleep(5)
        driver.quit()
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment