Browse Source

更新配置

Your Name 1 year ago
parent
commit
f7459ebd60
6 changed files with 106 additions and 71 deletions
  1. 1 1
      README.md
  2. 22 10
      gaode_api.py
  3. 40 37
      processor.py
  4. 7 7
      setting.py
  5. 34 16
      spider.py
  6. 2 0
      store.py

+ 1 - 1
README.md

@@ -4,6 +4,6 @@
 4.store.py --数据持久化
 5.log.py --日志配置
 6.setting.py --爬虫配置
-
+7.gaode_api.py --高德API
 
 pip install -r requirements.txt

+ 22 - 10
gaode_api.py

@@ -7,7 +7,10 @@ from log import PPLogger
 
 logger = PPLogger(name='gaode_api')
 logger.setup_logger()
+
+
 def get_zhoubian(location=None):
+    # print(location)
     data_dict = {'pipei': False, 'zhuzhai': [], 'xuexiao': [], 'xiezilou': [], 'jingpin': [], 'liansuo': []}
     if not location:
         return data_dict
@@ -18,16 +21,20 @@ def get_zhoubian(location=None):
         'keywords': keyword,
         'location': location,
         'radius': str(BANJING),
+        'region': '370200',
+        'city_limit': 'true',
         'show_fields': 'business',
-        'page_size': '50'
+        # 'page_size': '50'
     } for keyword in keywords]
     params = {
         'key': GAODE_KEY,
         'types': '080113|120201|120202|120203|120300|141201|141202|141206',
         'location': location,
         'radius': str(BANJING),
+        'region': '370200',
+        'city_limit': 'true',
         'show_fields': 'business',
-        'page_size': '50'
+        # 'page_size': '50'
     }
     results_liansuo = []
     with ThreadPoolExecutor() as pool:
@@ -43,26 +50,31 @@ def get_zhoubian(location=None):
         return data_dict
 
     for liansuo in results_liansuo:
-        data_dict['liansuo'].extend(liansuo['pois'])
+        try:
+            for pois in liansuo['pois']:
+                # print(pois)
+                data_dict['liansuo'].append({'name':pois['name'], 'address':pois['address'], 'distance':pois['distance']})
+        except Exception as e:
+            logger.error(e, liansuo)
     for qita in results_qita['pois']:
         if '写字楼' in qita['type']:
-            data_dict['xiezilou'].append(qita)
+            data_dict['xiezilou'].append({'name':qita['name'], 'address':qita['address'], 'distance':qita['distance']})
         elif '住宅' in qita['type']:
-            data_dict['zhuzhai'].append(qita)
+            data_dict['zhuzhai'].append({'name':qita['name'], 'address':qita['address'], 'distance':qita['distance']})
         elif '学校' in qita['type']:
-            data_dict['xuexiao'].append(qita)
+            data_dict['xuexiao'].append({'name':qita['name'], 'address':qita['address'], 'distance':qita['distance']})
         elif '台球' in qita['type']:
-            data_dict['jingpin'].append(qita)
+            data_dict['jingpin'].append({'name':qita['name'], 'address':qita['address'], 'distance':qita['distance']})
         else:
             pass
 
     if (len(data_dict['zhuzhai']) >= int(ZHUZHAI_COUNT) or len(data_dict['xuexiao']) >= int(XUEXIAO_COUNT) or len(
             data_dict['xiezilou']) >= int(XIEZILOU_COUNT)) \
-            and len(data_dict['jingpin']) <= int(LIANSUO_COUNT) \
-            and len(data_dict['liansuo']) >= int(JINGPIN_COUNT):
+            and len(data_dict['jingpin']) <= int(JINGPIN_COUNT) \
+            and len(data_dict['liansuo']) >= int(LIANSUO_COUNT):
         data_dict['pipei'] = True
 
     return data_dict
 
 
-# print(get_zhoubian('120.42716445444404,36.165840548830225'))
+# print(get_zhoubian('120.469038,36.398833'))

+ 40 - 37
processor.py

@@ -6,6 +6,8 @@ from store import WuBaStore
 from setting import STORE_METHOD
 from urllib import parse
 from gaode_api import get_zhoubian
+from threading import current_thread
+
 
 class WuBaProcessor:
     def __init__(self):
@@ -22,41 +24,42 @@ class WuBaProcessor:
         """
         soup_detail = BeautifulSoup(detail_text, 'lxml')
         data_dict = dict()
-        try:
-            data_dict['title'] = soup_detail.find('div', class_='house-title').find('h1').text
-            data_dict['tags'] = [span.text for span in
-                                 soup_detail.find('div', class_='house-title').find('p').find_all('span')]
-            data_dict['money_month'] = ''.join(
-                [span.text for span in soup_detail.find('p', class_='house_basic_title_money').find_all('span')[:2]])
-            data_dict['money_day'] = soup_detail.find('p', class_='house_basic_title_money').find_all('span')[2].text
-            data_dict['area'] = soup_detail.find('p', class_='house_basic_title_info').find('span').text
-            data_dict['type'] = soup_detail.find('p', class_='house_basic_title_info').find_all('span')[1].text
-            data_dict['qizuqi'] = soup_detail.find('p', class_='house_basic_title_info').find_all('span')[2].text
-            data_dict['address'] = soup_detail.find('h3', class_='general-weizhi-title').text
-            data_dict['poster_name'] = soup_detail.find('span', class_='name-text').text
-            data_dict['poster_phone'] = re.search(r'"phone":"(.*?)",', detail_text).group(1)
-            data_dict['intro'] = [{li.find('span', class_='title').text: li.find('span', class_='content').text} for li
-                                  in soup_detail.find('ul', class_='general-item-wrap').find_all('li')]
-            data_dict['miaoshu'] = [{div.find('p').text: div.find('article').text} for div in
-                                    soup_detail.find_all('div', class_='des-item')]
-            data_dict['peitao'] = [li.text for li in soup_detail.find_all('li', class_='peitao-on')]
-            data_dict['pics'] = [parse.unquote(img.attrs['src']) for img in
-                                 soup_detail.find('ul', class_='general-pic-list').find_all('img')]
-            data_dict['location'] = soup_detail.find('meta', attrs={'name': 'location'}).attrs['content']
-            data_dict['url'] = parse.unquote(detail_url)
+        # try:
+        data_dict['title'] = soup_detail.find('div', class_='house-title').find('h1').text
+        data_dict['tags'] = [span.text for span in
+                             soup_detail.find('div', class_='house-title').find('p').find_all('span')]
+        data_dict['money_month'] = ''.join(
+            [span.text for span in soup_detail.find('p', class_='house_basic_title_money').find_all('span')[:2]])
+        data_dict['money_day'] = soup_detail.find('p', class_='house_basic_title_money').find_all('span')[2].text
+        data_dict['area'] = soup_detail.find('p', class_='house_basic_title_info').find('span').text
+        data_dict['type'] = soup_detail.find('p', class_='house_basic_title_info').find_all('span')[1].text
+        data_dict['qizuqi'] = soup_detail.find('p', class_='house_basic_title_info').find_all('span')[2].text
+        data_dict['address'] = soup_detail.find('h3', class_='general-weizhi-title').text
+        data_dict['poster_name'] = soup_detail.find('span', class_='name-text').text
+        data_dict['poster_phone'] = re.search(r'"phone":"(.*?)",', detail_text).group(1)
+        data_dict['intro'] = [{li.find('span', class_='title').text: li.find('span', class_='content').text} for li
+                              in soup_detail.find('ul', class_='general-item-wrap').find_all('li')]
+        data_dict['miaoshu'] = [{div.find('p').text: div.find('article').text} for div in
+                                soup_detail.find_all('div', class_='des-item')]
+        data_dict['peitao'] = [li.text for li in soup_detail.find_all('li', class_='peitao-on')]
+        data_dict['pics'] = [parse.unquote(img.attrs['src']) for img in
+                             soup_detail.find('ul', class_='general-pic-list').find_all('img')]
+        data_dict['location'] = soup_detail.find('meta', attrs={'name': 'location'}).attrs['content']
+        data_dict['url'] = parse.unquote(detail_url)
 
-            data_dict_json = json.loads(
-                re.sub(r'\\n|\s|\\xa0|询问卖方心理预期?|询问具体转让内容?|位置-', '', str(data_dict)).replace("'", '"'))
-            # 手机号转文本格式
-            data_dict_json['poster_phone'] = data_dict_json['poster_phone']+'\t' if data_dict_json['poster_phone'] else '未公开手机号,点击待租商铺链接调用拨打'
-            # 提取经纬度
-            temp_list = data_dict_json['location'].split('=')[-1].split(',') if data_dict_json['location'] else None
-            data_dict_json['location'] = f"{float(temp_list[0]):.6f},{float(temp_list[0]):.6f}" if temp_list else None
-            # 调用高德API获取量化周边数据
-            data_dict_json.update(get_zhoubian(data_dict_json['location']))
-            self.logger.info(data_dict_json)
-            # 持久化
-            self.store.run(data_dict_json)
-        except Exception as e:
-            self.logger.error(detail_url)
-            self.logger.error(e)
+        data_dict_json = json.loads(
+            re.sub(r'\\n|\s|\\xa0|询问卖方心理预期?|询问具体转让内容?|位置-', '', str(data_dict)).replace("'", '"'))
+        # 手机号转文本格式
+        data_dict_json['poster_phone'] = data_dict_json['poster_phone']+'\t' if data_dict_json['poster_phone'] else '未公开手机号,点击待租商铺链接调用拨打'
+        # 提取经纬度
+        temp_list = data_dict_json['location'].split('=')[-1].split(',') if data_dict_json['location'] else None
+        # data_dict_json['location'] = f"{float(temp_list[0]):.6f},{float(temp_list[0]):.6f}" if temp_list else None
+        data_dict_json['location'] = f"{float(temp_list[0])},{float(temp_list[1])}" if temp_list else None
+        # 调用高德API获取量化周边数据
+        data_dict_json.update(get_zhoubian(data_dict_json['location']))
+        self.logger.info(current_thread().name+str(data_dict_json))
+        # 持久化
+        self.store.run(data_dict_json)
+        # except Exception as e:
+        #     self.logger.error(detail_url)
+        #     self.logger.error(e)

+ 7 - 7
setting.py

@@ -13,21 +13,21 @@ LOG_FILE = '58spider.log'
 LOG_FORMAT = '%(asctime)s - %(filename)s - %(lineno)d - %(name)s - %(levelname)s - %(message)s'
 
 # 爬虫的并发请求数量
-CONCURRENT_REQUESTS = 5
+CONCURRENT_REQUESTS = 3
 
 # 爬虫的User-Agent
 USER_AGENT = [
     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
-    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
-    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
+    # "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
+    # "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
+    # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
 ]
 
 # 爬虫的IP代理
-DOWNLOADER_IP_PROXY = 'https://api.xiaoxiangdaili.com/ip/get?appKey=1111188555084222464&appSecret=Z6eCYDT6&cnt=1&wt=json&method=http&city=&province='
+DOWNLOADER_IP_PROXY = 'https://api.xiaoxiangdaili.com/ip/get?appKey=1114815955776983040&appSecret=M4RS3NpN&cnt=2&wt=json&method=http&city=&province='
 
 # 区名
-_QU = 'laoshan'
+_QU = 'jimo'
 # 面积
 _AREA = '100_300'
 # 房租
@@ -45,7 +45,7 @@ CSV_DIR = 'output_csv'
 CSV_FILENAME = f'{_QU}_{_AREA}_{_MONEY}_{int(time.time() * 1000)}.csv'
 
 # 高德web服务API-key
-GAODE_KEY = '819ddb3d210100e53d651dbc7ae8f11b'
+GAODE_KEY = '515a64d5324a70ba9c5a95f9539370ec'
 
 # 量化-半径
 BANJING = '750'

+ 34 - 16
spider.py

@@ -1,11 +1,12 @@
 import os
+from threading import current_thread
 import requests
 from bs4 import BeautifulSoup
 from processor import WuBaProcessor
 import time
 import random
 from verify.yidun import YiDun
-# from threading import Lock
+from threading import Thread
 from queue import Queue
 from log import PPLogger
 from setting import CONCURRENT_REQUESTS, USER_AGENT, DOWNLOADER_IP_PROXY, START_URL
@@ -37,11 +38,14 @@ class WuBaSpider:
         :return:
         """
         if session_:
-            self.ip_pool.remove(session_.proxies)
-            self.logger.info(f'ip_pool remove {session_.proxies}')
+            try:
+                self.ip_pool.remove(session_.proxies)
+            except Exception as e:
+                pass
+            self.logger.debug(f'ip_pool remove {session_.proxies}')
         if not self.ip_pool:
             res_ip = requests.get(self.ip_proxy).json()
-            self.logger.info(res_ip)
+            self.logger.debug(res_ip)
             if res_ip['code'] != 200:
                 os._exit(0)  # 代理ip请求失败,结束程序
             for i in res_ip['data']:
@@ -102,13 +106,20 @@ class WuBaSpider:
         # 易盾检测
         if 'verifycode' in res_page.url:
             session = self.yidun.verify(session, res_page.url)
-            res_page = session.get(page_url, headers=self.headers, allow_redirects=True, timeout=10)
+            try:
+                res_page = session.get(page_url, headers=self.headers, allow_redirects=True, timeout=10)
+            except Exception as e:
+                self.logger.error(e)
+                session = self.new_session(session)
+                res_page = session.get(page_url, headers=self.headers, allow_redirects=True, timeout=10)
 
         soup_page = BeautifulSoup(res_page.text, 'lxml')
-        detail_urls = [li.find('a').attrs['href'] for li in soup_page.find('div', class_='content-side-left').find_all('li')]
-        for detail_url in detail_urls:
-            self.q.put_nowait(detail_url)  # 加入队列
-            self.logger.info(f'detail_url {detail_url}')
+        if soup_page.find('div', class_='content-side-left'):
+            detail_urls = [li.find('a').attrs['href'] for li in soup_page.find('div', class_='content-side-left').find_all('li')]
+
+            for detail_url in detail_urls:
+                self.q.put_nowait(detail_url)  # 加入队列
+                self.logger.debug(f'detail_url {self.q.qsize()}')
 
     def get_content(self):
         """
@@ -116,9 +127,11 @@ class WuBaSpider:
         """
         session = self.new_session()
         while True:
+            self.logger.debug(f'剩余 {self.q.qsize()}')
             if self.q.empty():
-                time.sleep(5)  # 无数据等待5秒再退出
+                time.sleep(10)  # 无数据等待10秒再退出
                 if self.q.empty():
+                    self.logger.info(current_thread().name+' quit')
                     break
             detail_url = self.q.get()
             self.headers['user-agent'] = random.choice(self.ua)
@@ -130,7 +143,7 @@ class WuBaSpider:
                 self.q.put_nowait(detail_url)  # 请求失败后,url放回队列
                 continue
             else:
-                self.logger.info(res_detail.url)
+                self.logger.debug(res_detail.url)
                 # 极验检测
                 if 'geetest' in res_detail.text:
                     self.q.put_nowait(detail_url)
@@ -150,15 +163,20 @@ class WuBaSpider:
                     self.q.put_nowait(detail_url)
                     continue
                 # with self.lock:  # 文件写入锁
-                self.psr.processor(res_detail.text, res_detail.url)
+                try:
+                    self.psr.processor(res_detail.text, res_detail.url)
+                except Exception as e:
+                    self.logger.error(str(e)+res_detail.url)
+                    # self.q.put_nowait(detail_url)
 
     def start(self):
         page_urls = self.get_page_urls(START_URL)
+        # for page_url in page_urls:
+        #     self.get_detail_urls(page_url)
         with ThreadPoolExecutor() as pool:
-            results1 = [pool.submit(self.get_detail_urls, page_url) for page_url in page_urls[:1]]
-            while self.q.empty():
-                time.sleep(1)
-            results2 = [pool.submit(self.get_content) for _ in range(CONCURRENT_REQUESTS)]
+            results1 = [pool.submit(self.get_detail_urls, page_url) for page_url in page_urls]
+        for _ in range(CONCURRENT_REQUESTS):
+            Thread(target=self.get_content).start()
 
 
 

+ 2 - 0
store.py

@@ -3,6 +3,8 @@ import pandas as pd
 from log import PPLogger
 from setting import CSV_FILENAME, CSV_DIR
 from threading import Lock
+pd.options.display.max_colwidth = 100000
+
 
 class WuBaStore:
     def __init__(self, method):