
Update configuration

Your Name, 1 year ago
parent commit f7459ebd60
6 changed files with 106 additions and 71 deletions
  1. README.md (+1 -1)
  2. gaode_api.py (+22 -10)
  3. processor.py (+40 -37)
  4. setting.py (+7 -7)
  5. spider.py (+34 -16)
  6. store.py (+2 -0)

+ 1 - 1
README.md

@@ -4,6 +4,6 @@
 4.store.py --data persistence
 5.log.py --logging config
 6.setting.py --spider config
-
+7.gaode_api.py --Gaode API
 
 pip install -r requirements.txt

+ 22 - 10
gaode_api.py

@@ -7,7 +7,10 @@ from log import PPLogger
 
 logger = PPLogger(name='gaode_api')
 logger.setup_logger()
+
+
 def get_zhoubian(location=None):
+    # print(location)
     data_dict = {'pipei': False, 'zhuzhai': [], 'xuexiao': [], 'xiezilou': [], 'jingpin': [], 'liansuo': []}
     if not location:
         return data_dict
@@ -18,16 +21,20 @@ def get_zhoubian(location=None):
         'keywords': keyword,
         'location': location,
         'radius': str(BANJING),
+        'region': '370200',
+        'city_limit': 'true',
         'show_fields': 'business',
-        'page_size': '50'
+        # 'page_size': '50'
     } for keyword in keywords]
     params = {
         'key': GAODE_KEY,
         'types': '080113|120201|120202|120203|120300|141201|141202|141206',
         'location': location,
         'radius': str(BANJING),
+        'region': '370200',
+        'city_limit': 'true',
         'show_fields': 'business',
-        'page_size': '50'
+        # 'page_size': '50'
     }
     results_liansuo = []
     with ThreadPoolExecutor() as pool:
@@ -43,26 +50,31 @@ def get_zhoubian(location=None):
         return data_dict
 
     for liansuo in results_liansuo:
-        data_dict['liansuo'].extend(liansuo['pois'])
+        try:
+            for pois in liansuo['pois']:
+                # print(pois)
+                data_dict['liansuo'].append({'name':pois['name'], 'address':pois['address'], 'distance':pois['distance']})
+        except Exception as e:
+            logger.error(e, liansuo)
     for qita in results_qita['pois']:
         if '写字楼' in qita['type']:
-            data_dict['xiezilou'].append(qita)
+            data_dict['xiezilou'].append({'name':qita['name'], 'address':qita['address'], 'distance':qita['distance']})
         elif '住宅' in qita['type']:
-            data_dict['zhuzhai'].append(qita)
+            data_dict['zhuzhai'].append({'name':qita['name'], 'address':qita['address'], 'distance':qita['distance']})
         elif '学校' in qita['type']:
-            data_dict['xuexiao'].append(qita)
+            data_dict['xuexiao'].append({'name':qita['name'], 'address':qita['address'], 'distance':qita['distance']})
         elif '台球' in qita['type']:
-            data_dict['jingpin'].append(qita)
+            data_dict['jingpin'].append({'name':qita['name'], 'address':qita['address'], 'distance':qita['distance']})
         else:
             pass
 
     if (len(data_dict['zhuzhai']) >= int(ZHUZHAI_COUNT) or len(data_dict['xuexiao']) >= int(XUEXIAO_COUNT) or len(
             data_dict['xiezilou']) >= int(XIEZILOU_COUNT)) \
-            and len(data_dict['jingpin']) <= int(LIANSUO_COUNT) \
-            and len(data_dict['liansuo']) >= int(JINGPIN_COUNT):
+            and len(data_dict['jingpin']) <= int(JINGPIN_COUNT) \
+            and len(data_dict['liansuo']) >= int(LIANSUO_COUNT):
         data_dict['pipei'] = True
 
     return data_dict
 
 
-# print(get_zhoubian('120.42716445444404,36.165840548830225'))
+# print(get_zhoubian('120.469038,36.398833'))
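
Note on this hunk set: the new region/city_limit parameters pin around-search results to adcode 370200 (Qingdao) so POIs from neighbouring cities cannot leak into the counts, and the final hunk fixes a swapped pair of thresholds, so jingpin is now compared against JINGPIN_COUNT and liansuo against LIANSUO_COUNT, as the names intend. Below is a minimal sketch of the around-search request this module builds; the endpoint URL, response handling, and helper name are assumptions based on the Amap/Gaode v5 place API, not taken from this diff:

import requests

# Assumed Amap/Gaode v5 around-search endpoint; not shown in this diff.
AMAP_AROUND_URL = 'https://restapi.amap.com/v5/place/around'

def around_search(key, location, keyword, radius='750'):
    # region='370200' is Qingdao's adcode; city_limit='true' drops POIs
    # outside that region instead of merely down-ranking them.
    params = {
        'key': key,
        'keywords': keyword,
        'location': location,   # 'lng,lat'
        'radius': radius,       # metres
        'region': '370200',
        'city_limit': 'true',
        'show_fields': 'business',
    }
    resp = requests.get(AMAP_AROUND_URL, params=params, timeout=10)
    resp.raise_for_status()
    return resp.json().get('pois', [])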

+ 40 - 37
processor.py

@@ -6,6 +6,8 @@ from store import WuBaStore
 from setting import STORE_METHOD
 from urllib import parse
 from gaode_api import get_zhoubian
+from threading import current_thread
+
 
 class WuBaProcessor:
     def __init__(self):
@@ -22,41 +24,42 @@
         """
         soup_detail = BeautifulSoup(detail_text, 'lxml')
         data_dict = dict()
-        try:
-            data_dict['title'] = soup_detail.find('div', class_='house-title').find('h1').text
-            data_dict['tags'] = [span.text for span in
-                                 soup_detail.find('div', class_='house-title').find('p').find_all('span')]
-            data_dict['money_month'] = ''.join(
-                [span.text for span in soup_detail.find('p', class_='house_basic_title_money').find_all('span')[:2]])
-            data_dict['money_day'] = soup_detail.find('p', class_='house_basic_title_money').find_all('span')[2].text
-            data_dict['area'] = soup_detail.find('p', class_='house_basic_title_info').find('span').text
-            data_dict['type'] = soup_detail.find('p', class_='house_basic_title_info').find_all('span')[1].text
-            data_dict['qizuqi'] = soup_detail.find('p', class_='house_basic_title_info').find_all('span')[2].text
-            data_dict['address'] = soup_detail.find('h3', class_='general-weizhi-title').text
-            data_dict['poster_name'] = soup_detail.find('span', class_='name-text').text
-            data_dict['poster_phone'] = re.search(r'"phone":"(.*?)",', detail_text).group(1)
-            data_dict['intro'] = [{li.find('span', class_='title').text: li.find('span', class_='content').text} for li
-                                  in soup_detail.find('ul', class_='general-item-wrap').find_all('li')]
-            data_dict['miaoshu'] = [{div.find('p').text: div.find('article').text} for div in
-                                    soup_detail.find_all('div', class_='des-item')]
-            data_dict['peitao'] = [li.text for li in soup_detail.find_all('li', class_='peitao-on')]
-            data_dict['pics'] = [parse.unquote(img.attrs['src']) for img in
-                                 soup_detail.find('ul', class_='general-pic-list').find_all('img')]
-            data_dict['location'] = soup_detail.find('meta', attrs={'name': 'location'}).attrs['content']
-            data_dict['url'] = parse.unquote(detail_url)
+        # try:
+        data_dict['title'] = soup_detail.find('div', class_='house-title').find('h1').text
+        data_dict['tags'] = [span.text for span in
+                             soup_detail.find('div', class_='house-title').find('p').find_all('span')]
+        data_dict['money_month'] = ''.join(
+            [span.text for span in soup_detail.find('p', class_='house_basic_title_money').find_all('span')[:2]])
+        data_dict['money_day'] = soup_detail.find('p', class_='house_basic_title_money').find_all('span')[2].text
+        data_dict['area'] = soup_detail.find('p', class_='house_basic_title_info').find('span').text
+        data_dict['type'] = soup_detail.find('p', class_='house_basic_title_info').find_all('span')[1].text
+        data_dict['qizuqi'] = soup_detail.find('p', class_='house_basic_title_info').find_all('span')[2].text
+        data_dict['address'] = soup_detail.find('h3', class_='general-weizhi-title').text
+        data_dict['poster_name'] = soup_detail.find('span', class_='name-text').text
+        data_dict['poster_phone'] = re.search(r'"phone":"(.*?)",', detail_text).group(1)
+        data_dict['intro'] = [{li.find('span', class_='title').text: li.find('span', class_='content').text} for li
+                              in soup_detail.find('ul', class_='general-item-wrap').find_all('li')]
+        data_dict['miaoshu'] = [{div.find('p').text: div.find('article').text} for div in
+                                soup_detail.find_all('div', class_='des-item')]
+        data_dict['peitao'] = [li.text for li in soup_detail.find_all('li', class_='peitao-on')]
+        data_dict['pics'] = [parse.unquote(img.attrs['src']) for img in
+                             soup_detail.find('ul', class_='general-pic-list').find_all('img')]
+        data_dict['location'] = soup_detail.find('meta', attrs={'name': 'location'}).attrs['content']
+        data_dict['url'] = parse.unquote(detail_url)
 
-            data_dict_json = json.loads(
-                re.sub(r'\\n|\s|\\xa0|询问卖方心理预期?|询问具体转让内容?|位置-', '', str(data_dict)).replace("'", '"'))
-            # convert the phone number to text format
-            data_dict_json['poster_phone'] = data_dict_json['poster_phone']+'\t' if data_dict_json['poster_phone'] else '未公开手机号,点击待租商铺链接调用拨打'
-            # extract longitude and latitude
-            temp_list = data_dict_json['location'].split('=')[-1].split(',') if data_dict_json['location'] else None
-            data_dict_json['location'] = f"{float(temp_list[0]):.6f},{float(temp_list[0]):.6f}" if temp_list else None
-            # call the Gaode API for quantified surrounding-area data
-            data_dict_json.update(get_zhoubian(data_dict_json['location']))
-            self.logger.info(data_dict_json)
-            # persist
-            self.store.run(data_dict_json)
-        except Exception as e:
-            self.logger.error(detail_url)
-            self.logger.error(e)
+        data_dict_json = json.loads(
+            re.sub(r'\\n|\s|\\xa0|询问卖方心理预期?|询问具体转让内容?|位置-', '', str(data_dict)).replace("'", '"'))
+        # convert the phone number to text format
+        data_dict_json['poster_phone'] = data_dict_json['poster_phone']+'\t' if data_dict_json['poster_phone'] else '未公开手机号,点击待租商铺链接调用拨打'
+        # extract longitude and latitude
+        temp_list = data_dict_json['location'].split('=')[-1].split(',') if data_dict_json['location'] else None
+        # data_dict_json['location'] = f"{float(temp_list[0]):.6f},{float(temp_list[0]):.6f}" if temp_list else None
+        data_dict_json['location'] = f"{float(temp_list[0])},{float(temp_list[1])}" if temp_list else None
+        # call the Gaode API for quantified surrounding-area data
+        data_dict_json.update(get_zhoubian(data_dict_json['location']))
+        self.logger.info(current_thread().name+str(data_dict_json))
+        # persist
+        self.store.run(data_dict_json)
+        # except Exception as e:
+        #     self.logger.error(detail_url)
+        #     self.logger.error(e)
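
Two real fixes hide in this hunk: the latitude component previously reused temp_list[0] for both coordinates and now correctly uses temp_list[1], and the broad try/except is commented out because spider.py now wraps the self.psr.processor call instead. One remaining fragility: round-tripping the record through str(data_dict).replace("'", '"') breaks whenever a scraped value contains a quote. A quote-safe alternative using json.dumps is sketched below; this is a suggestion under the same stripping rules, not what the commit does:

import json
import re

def clean_record(data_dict):
    # json.dumps escapes quotes inside values, so reloading cannot break
    # on apostrophes in titles or descriptions; the re.sub then strips the
    # same boilerplate substrings as the original code.
    raw = json.dumps(data_dict, ensure_ascii=False)
    raw = re.sub(r'\\n|\s|\\xa0|询问卖方心理预期?|询问具体转让内容?|位置-', '', raw)
    return json.loads(raw)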

+ 7 - 7
setting.py

@@ -13,21 +13,21 @@ LOG_FILE = '58spider.log'
 LOG_FORMAT = '%(asctime)s - %(filename)s - %(lineno)d - %(name)s - %(levelname)s - %(message)s'
 
 # number of concurrent spider requests
-CONCURRENT_REQUESTS = 5
+CONCURRENT_REQUESTS = 3
 
 # spider User-Agent
 USER_AGENT = [
     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
-    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
-    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
+    # "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
+    # "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
+    # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
 ]
 
 # spider IP proxy
-DOWNLOADER_IP_PROXY = 'https://api.xiaoxiangdaili.com/ip/get?appKey=1111188555084222464&appSecret=Z6eCYDT6&cnt=1&wt=json&method=http&city=&province='
+DOWNLOADER_IP_PROXY = 'https://api.xiaoxiangdaili.com/ip/get?appKey=1114815955776983040&appSecret=M4RS3NpN&cnt=2&wt=json&method=http&city=&province='
 
 # district name
-_QU = 'laoshan'
+_QU = 'jimo'
 # area
 _AREA = '100_300'
 # rent
@@ -45,7 +45,7 @@ CSV_DIR = 'output_csv'
 CSV_FILENAME = f'{_QU}_{_AREA}_{_MONEY}_{int(time.time() * 1000)}.csv'
 
 # Gaode web service API key
-GAODE_KEY = '819ddb3d210100e53d651dbc7ae8f11b'
+GAODE_KEY = '515a64d5324a70ba9c5a95f9539370ec'
 
 # quantification radius
 BANJING = '750'
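
This change rotates both the proxy credentials and the Gaode key, but the new values are still committed to version control. A common alternative, sketched here as an assumption rather than anything this project does, is to read them from the environment:

import os

# Sketch: read secrets from the environment so rotating a key does not
# require a commit; the variable names here are hypothetical.
GAODE_KEY = os.environ.get('GAODE_KEY', '')
DOWNLOADER_IP_PROXY = os.environ.get('DOWNLOADER_IP_PROXY', '')
if not GAODE_KEY:
    raise RuntimeError('GAODE_KEY is not set')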

+ 34 - 16
spider.py

@@ -1,11 +1,12 @@
 import os
+from threading import current_thread
 import requests
 from bs4 import BeautifulSoup
 from processor import WuBaProcessor
 import time
 import random
 from verify.yidun import YiDun
-# from threading import Lock
+from threading import Thread
 from queue import Queue
 from log import PPLogger
 from setting import CONCURRENT_REQUESTS, USER_AGENT, DOWNLOADER_IP_PROXY, START_URL
@@ -37,11 +38,14 @@ class WuBaSpider:
         :return:
         """
         if session_:
-            self.ip_pool.remove(session_.proxies)
-            self.logger.info(f'ip_pool remove {session_.proxies}')
+            try:
+                self.ip_pool.remove(session_.proxies)
+            except Exception as e:
+                pass
+            self.logger.debug(f'ip_pool remove {session_.proxies}')
         if not self.ip_pool:
             res_ip = requests.get(self.ip_proxy).json()
-            self.logger.info(res_ip)
+            self.logger.debug(res_ip)
             if res_ip['code'] != 200:
                 os._exit(0)  # proxy IP request failed; exit the program
             for i in res_ip['data']:
@@ -102,13 +106,20 @@ class WuBaSpider:
         # YiDun captcha check
         if 'verifycode' in res_page.url:
             session = self.yidun.verify(session, res_page.url)
-            res_page = session.get(page_url, headers=self.headers, allow_redirects=True, timeout=10)
+            try:
+                res_page = session.get(page_url, headers=self.headers, allow_redirects=True, timeout=10)
+            except Exception as e:
+                self.logger.error(e)
+                session = self.new_session(session)
+                res_page = session.get(page_url, headers=self.headers, allow_redirects=True, timeout=10)
 
         soup_page = BeautifulSoup(res_page.text, 'lxml')
-        detail_urls = [li.find('a').attrs['href'] for li in soup_page.find('div', class_='content-side-left').find_all('li')]
-        for detail_url in detail_urls:
-            self.q.put_nowait(detail_url)  # add to the queue
-            self.logger.info(f'detail_url {detail_url}')
+        if soup_page.find('div', class_='content-side-left'):
+            detail_urls = [li.find('a').attrs['href'] for li in soup_page.find('div', class_='content-side-left').find_all('li')]
+
+            for detail_url in detail_urls:
+                self.q.put_nowait(detail_url)  # add to the queue
+                self.logger.debug(f'detail_url {self.q.qsize()}')
 
     def get_content(self):
         """
@@ -116,9 +127,11 @@
         """
         session = self.new_session()
         while True:
+            self.logger.debug(f'剩余 {self.q.qsize()}')
             if self.q.empty():
-                time.sleep(5)  # no data: wait 5 seconds before exiting
+                time.sleep(10)  # no data: wait 10 seconds before exiting
                 if self.q.empty():
+                    self.logger.info(current_thread().name+' quit')
                     break
             detail_url = self.q.get()
             self.headers['user-agent'] = random.choice(self.ua)
@@ -130,7 +143,7 @@ class WuBaSpider:
                 self.q.put_nowait(detail_url)  # on failure, put the url back in the queue
                 continue
             else:
-                self.logger.info(res_detail.url)
+                self.logger.debug(res_detail.url)
                 # Geetest captcha check
                 if 'geetest' in res_detail.text:
                     self.q.put_nowait(detail_url)
@@ -150,15 +163,20 @@ class WuBaSpider:
                     self.q.put_nowait(detail_url)
                     continue
                 # with self.lock:  # file-write lock
-                self.psr.processor(res_detail.text, res_detail.url)
+                try:
+                    self.psr.processor(res_detail.text, res_detail.url)
+                except Exception as e:
+                    self.logger.error(str(e)+res_detail.url)
+                    # self.q.put_nowait(detail_url)
 
     def start(self):
         page_urls = self.get_page_urls(START_URL)
+        # for page_url in page_urls:
+        #     self.get_detail_urls(page_url)
         with ThreadPoolExecutor() as pool:
-            results1 = [pool.submit(self.get_detail_urls, page_url) for page_url in page_urls[:1]]
-            while self.q.empty():
-                time.sleep(1)
-            results2 = [pool.submit(self.get_content) for _ in range(CONCURRENT_REQUESTS)]
+            results1 = [pool.submit(self.get_detail_urls, page_url) for page_url in page_urls]
+        for _ in range(CONCURRENT_REQUESTS):
+            Thread(target=self.get_content).start()
 
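start() now lets the ThreadPoolExecutor run every get_detail_urls producer to completion (the with-block joins them) before launching CONCURRENT_REQUESTS plain Threads as consumers, and get_content exits on an empty-queue, sleep, re-check heuristic. That heuristic can still race when one worker re-enqueues a failed URL after another worker has decided to quit. A common alternative is sentinel values, sketched below under the assumption that retries are handled separately; fetch and handle are hypothetical stand-ins for the spider's request and processing steps, not this repo's code:

from queue import Queue
from threading import Thread

def run(urls, fetch, handle, workers=3):
    # Producer/consumer with sentinels instead of sleep-and-recheck:
    # each worker exits deterministically when it dequeues a None.
    q = Queue()

    def consume():
        while True:
            url = q.get()
            if url is None:      # sentinel: no more work
                break
            handle(fetch(url))

    threads = [Thread(target=consume) for _ in range(workers)]
    for t in threads:
        t.start()
    for url in urls:
        q.put(url)
    for _ in threads:
        q.put(None)              # one sentinel per worker
    for t in threads:
        t.join()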

+ 2 - 0
store.py

@@ -3,6 +3,8 @@ import pandas as pd
 from log import PPLogger
 from setting import CSV_FILENAME, CSV_DIR
 from threading import Lock
+pd.options.display.max_colwidth = 100000
+
 
 class WuBaStore:
     def __init__(self, method):
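
The new pd.options.display.max_colwidth setting matters because pandas truncates cell text at 50 characters by default when rendering a frame to a string, so long fields such as the pics URLs would otherwise end in '...'. Note that df.to_csv is unaffected by this option; the setting only helps if WuBaStore renders frames to text somewhere, which is an assumption here. A small demonstration:

import pandas as pd

# By default, to_string() truncates cell text at 50 characters.
df = pd.DataFrame({'pics': ['http://example.com/' + 'x' * 200]})
print(df.to_string())                      # value ends in '...'

pd.options.display.max_colwidth = 100000   # the value this commit sets
print(df.to_string())                      # full URL survives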