本文共 2166 字,大约阅读时间需要 7 分钟。
很久之前写着玩的,最近拿出来还能用,等过段时间优化下
# Pipeline: open the listing HTML -> collect every href from the target
# tags -> visit each href -> locate the target elements -> extract each
# image URL -> save the image to local disk via getAndSaveImg().
import time
import os
import re
import urllib.request
import uuid

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait


def generateFileName():
    """Return a unique file-name stem (uuid1 string)."""
    return str(uuid.uuid1())


def createFileWithFileName(localPathParam, fileName):
    """Ensure an empty file named fileName exists under localPathParam.

    Returns the full path in every case (previously ambiguous when the
    file already existed), so callers always get a usable target path.
    """
    totalPath = localPathParam + '\\' + fileName
    if not os.path.exists(totalPath):
        # 'a+' creates the file if missing without truncating; the
        # context manager guarantees the handle is closed.
        with open(totalPath, 'a+'):
            pass
    return totalPath


def getAndSaveImg(imgUrl, img_name):
    """Download imgUrl into C:\\Downloads, naming it after img_name.

    Characters illegal in Windows file names are replaced with '-'.
    Does nothing when imgUrl is empty or None (get_attribute can
    return None; the old len() check raised TypeError on None).
    """
    if not imgUrl:
        return
    # Raw string: the old '[\/:*?"<>|]' relied on an invalid \/ escape.
    fileName = re.sub(r'[\\/:*?"<>|]', '-', img_name + '.jpg')
    try:
        urllib.request.urlretrieve(
            imgUrl, createFileWithFileName("C:\\Downloads", fileName))
    except Exception:
        # Narrowed from a bare except; keep the best-effort behavior.
        print("这图我没法下载")


def get_list():
    """Append the href of every .list-group-item on the current page to list_info."""
    for item in driver.find_elements_by_class_name("list-group-item"):
        href = item.get_attribute("href")
        print(href)
        # 'href' no longer shadows the builtin 'list' as before.
        list_info.append(href)


if __name__ == "__main__":
    driver = webdriver.PhantomJS()
    driver.set_window_size(1400, 900)
    for m in range(28, 50):
        list_info = []
        url = "http://www.doutula.com/article/list?page=" + str(m + 1)
        driver.get(url)
        # Actually pause 2s after page load: the original constructed a
        # WebDriverWait without calling .until(), which waits for nothing.
        time.sleep(2)
        # Fill list_info with every article URL on this listing page.
        get_list()
        for j, article_url in enumerate(list_info):
            driver.get(article_url)
            time.sleep(2)
            cells = driver.find_elements_by_xpath(
                "//div[@class='artile_des']/table/tbody")
            for x, cell in enumerate(cells):
                # Locate the <img> once (was looked up twice per cell).
                img = cell.find_element_by_tag_name("img")
                img_url = img.get_attribute("src")
                img_name = img.get_attribute("alt")
                print("坐标" + str(m + 1) + ":" + str(j) + ":" + str(x))
                print(img_url + "----->" + img_name)
                getAndSaveImg(img_url, img_name)
转载于:https://www.cnblogs.com/lm1107/p/7001343.html