|
先上图
代码
- import time
- import re
- import requests
- from bs4 import BeautifulSoup
- import pandas as pd
- import numpy as np
- from selenium import webdriver
- from selenium.webdriver.common.keys import Keys
- from selenium.webdriver.common.action_chains import ActionChains
- from selenium.webdriver.support.select import Select
- #driver=webdriver.Chrome()
- url_list=[]
- for i in range(1,1742):
- print(i)
- url='https://www.bxd365.com/agent/0-0-0/'+str(i)+'.html'
- #url='https://www.bxd365.com/agent/0-0-0/2.html'
- try:
- page=requests.get(url,timeout=15)
- except:
- time.sleep(5)
- page=requests.get(url,timeout=15)
- page.encoding='utf-8'
- page=page.text
- p_url=re.compile(r"""class="name">\r\n\t\t\t\t\t\t\t<a href="([\s\S]*?)" target="_blank">""")
- url=p_url.findall(page)
- url_list.extend(url)
- len(url_list)
- count=0
- content_dict={}
- for url in url_list:
- #url='https://bxd574268973.bxd365.com/'
- try:
- page=requests.get(url,timeout=15)
- except:
- time.sleep(5)
- page=requests.get(url,timeout=15)
- page.encoding='utf-8'
- page=page.text
- page
- p_content=re.compile(r"""个性签名:</span>\r\n\t\t\t\t\t\t<a class="f14co2 cu">\r\n\t\t\t\t\t\t\t([\s\S]*?)\t\t\t\t\t\t</a>""")
- content=p_content.findall(page)
- if len(content)>0:
- content_dict[url]=content[0]
- print(url)
- count+=1
- print(count)
-
- len(content_dict)
- result=''
- count=0
- for i in content_dict.values():
- if i!='保险是晴天的一把伞,是汽车的安全带':
- result=result+i
- count=count+1
- print(count)
- result
复制代码
#词云图代码
- import matplotlib.pyplot as plt #数学绘图库
- import jieba #分词库
- from wordcloud import WordCloud #词云库
-
- #1、读入txt文本数据
- text = open(r'C:/Users/Administrator/Desktop/保险代理人个性标签.txt',"r").read()
-
- #2、结巴分词,默认精确模式。可以添加自定义词典userdict.txt,然后jieba.load_userdict(file_name) ,file_name为文件类对象或自定义词典的路径
- # 自定义词典格式和默认词库dict.txt一样,一个词占一行:每一行分三部分:词语、词频(可省略)、词性(可省略),用空格隔开,顺序不可颠倒
-
- cut_text= jieba.cut(text)
- result= "/".join(cut_text)#必须给个符号分隔开分词结果来形成字符串,否则不能绘制词云
-
- #print(result)
- my_wordcloud = WordCloud(font_path='C:/Users/Administrator/Desktop/msyh.ttf').generate(result)
-
- plt.imshow(my_wordcloud)
- plt.axis("off")
- plt.show()
复制代码 |
|