1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465 |
- import json
- import re
- from log import PPLogger
- from bs4 import BeautifulSoup
- from store import WuBaStore
- from setting import STORE_METHOD
- from urllib import parse
- from gaode_api import get_zhoubian
- from threading import current_thread
class WuBaProcessor:
    """Turn a shop detail page into a cleaned record, enrich it and persist it.

    Each instance owns its own logger and store.
    """

    # Noise stripped from every scraped string: all whitespace plus a few
    # boilerplate phrases.
    # NOTE(review): the "?" after the first two phrases makes their last
    # character optional instead of matching a literal question mark; the
    # pattern is kept as it was -- confirm the intent.
    _NOISE_RE = re.compile(r'\s|询问卖方心理预期?|询问具体转让内容?|位置-')
    # The poster's phone number is read from the raw page text with a regex,
    # not from the parsed DOM.
    _PHONE_RE = re.compile(r'"phone":"(.*?)",')
    # Stored instead of a number when the page exposes none.
    _PHONE_HIDDEN = '未公开手机号,点击待租商铺链接调用拨打'

    def __init__(self):
        self.logger = PPLogger(name='58processor')
        self.logger.setup_logger()
        self.store = WuBaStore(STORE_METHOD)

    def processor(self, detail_text, detail_url):
        """Process one shop detail page.

        The page is parsed into a record, every string in the record is
        cleaned, the phone number and coordinates are normalised, the record
        is extended with the result of ``get_zhoubian`` and finally handed to
        the store.

        Errors caused by page elements that are missing from the HTML are not
        caught here; they propagate to the caller.

        :param detail_text: HTML of the shop page
        :param detail_url: URL of the shop page
        :return: None
        """
        soup = BeautifulSoup(detail_text, 'lxml')
        record = self._clean(self._extract(soup, detail_text, detail_url))

        record['poster_phone'] = self._format_phone(record['poster_phone'])
        record['location'] = self._format_location(record['location'])

        # Enrich the record with surrounding-area data from the Gaode API helper.
        record.update(get_zhoubian(record['location']))
        self.logger.info(current_thread().name + str(record))

        # Persist.
        self.store.run(record)

    @classmethod
    def _extract(cls, soup, detail_text, detail_url):
        """Collect the raw (uncleaned) fields of a listing into a dict."""
        # Containers that several fields are read from; looked up once.
        title_box = soup.find('div', class_='house-title')
        money_spans = soup.find('p', class_='house_basic_title_money').find_all('span')
        info_box = soup.find('p', class_='house_basic_title_info')
        info_spans = info_box.find_all('span')
        phone_match = cls._PHONE_RE.search(detail_text)

        return {
            'title': title_box.find('h1').text,
            'tags': [span.text for span in title_box.find('p').find_all('span')],
            'money_month': ''.join(span.text for span in money_spans[:2]),
            'money_day': money_spans[2].text,
            'area': info_box.find('span').text,
            'type': info_spans[1].text,
            'qizuqi': info_spans[2].text,
            'address': soup.find('h3', class_='general-weizhi-title').text,
            'poster_name': soup.find('span', class_='name-text').text,
            # A page without a phone field is treated like an empty number so
            # that the "number not published" fallback applies.
            'poster_phone': phone_match.group(1) if phone_match else '',
            'intro': [
                {li.find('span', class_='title').text: li.find('span', class_='content').text}
                for li in soup.find('ul', class_='general-item-wrap').find_all('li')
            ],
            'miaoshu': [
                {div.find('p').text: div.find('article').text}
                for div in soup.find_all('div', class_='des-item')
            ],
            'peitao': [li.text for li in soup.find_all('li', class_='peitao-on')],
            'pics': [
                parse.unquote(img.attrs['src'])
                for img in soup.find('ul', class_='general-pic-list').find_all('img')
            ],
            'location': soup.find('meta', attrs={'name': 'location'}).attrs['content'],
            'url': parse.unquote(detail_url),
        }

    @classmethod
    def _clean(cls, value):
        """Return ``value`` with the noise pattern removed from every string.

        Dicts (keys and values) and lists are walked recursively; any other
        type is returned unchanged. Working on the real strings instead of a
        ``str(dict)`` -> JSON round trip means text containing quote
        characters no longer breaks parsing, and every kind of whitespace
        (including tabs and carriage returns) is removed.
        """
        if isinstance(value, str):
            return cls._NOISE_RE.sub('', value)
        if isinstance(value, dict):
            return {cls._clean(key): cls._clean(item) for key, item in value.items()}
        if isinstance(value, list):
            return [cls._clean(item) for item in value]
        return value

    @classmethod
    def _format_phone(cls, phone):
        """Return the phone number in text form, or the fallback notice if empty."""
        # The trailing tab keeps the number as text (not a numeric cell) downstream.
        return phone + '\t' if phone else cls._PHONE_HIDDEN

    @staticmethod
    def _format_location(raw):
        """Extract the coordinate pair from the location meta content.

        The pair is taken from whatever follows the last ``=`` and is returned
        as ``"<float>,<float>"``; an empty input yields ``None``.
        """
        if not raw:
            return None
        parts = raw.split('=')[-1].split(',')
        return f"{float(parts[0])},{float(parts[1])}"
|