import json
import re
from threading import current_thread
from urllib import parse

from bs4 import BeautifulSoup

from log import PPLogger
from store import WuBaStore
from setting import STORE_METHOD
from gaode_api import get_zhoubian

# Noise stripped from every scraped string: whitespace, literal "\n" / "\xa0"
# escape text, two boilerplate prompt phrases and the "位置-" prefix.
# The pattern text is unchanged from the original implementation.
# NOTE(review): the trailing "?" in the two phrases is a regex quantifier
# (it makes the last character optional), not a literal question mark —
# confirm that this is what was intended.
_NOISE_RE = re.compile(r'\\n|\s|\\xa0|询问卖方心理预期?|询问具体转让内容?|位置-')


def _strip_noise(value):
    """Return *value* with the noise pattern removed from every string in it.

    Lists and dicts are walked recursively (dict keys are cleaned as well as
    values); any other type is returned unchanged.
    """
    if isinstance(value, str):
        return _NOISE_RE.sub('', value)
    if isinstance(value, list):
        return [_strip_noise(item) for item in value]
    if isinstance(value, dict):
        return {_strip_noise(key): _strip_noise(item) for key, item in value.items()}
    return value


class WuBaProcessor:
    """Turn a shop detail page into a cleaned record and persist it."""

    def __init__(self):
        self.logger = PPLogger(name='58processor')
        self.logger.setup_logger()
        self.store = WuBaStore(STORE_METHOD)

    def processor(self, detail_text: str, detail_url: str) -> None:
        """
        Extract the listing fields from a shop page, clean them, enrich them
        with surrounding-area data and hand the result to the store.

        Errors caused by missing page elements (``AttributeError``,
        ``IndexError``, ``KeyError``) are not caught here and propagate to
        the caller.

        :param detail_text: HTML of the shop page
        :param detail_url: URL of the shop page
        :return: None
        """
        soup_detail = BeautifulSoup(detail_text, 'lxml')
        record = dict()

        title_box = soup_detail.find('div', class_='house-title')
        record['title'] = title_box.find('h1').text
        record['tags'] = [span.text for span in title_box.find('p').find_all('span')]

        money_spans = soup_detail.find('p', class_='house_basic_title_money').find_all('span')
        record['money_month'] = ''.join([span.text for span in money_spans[:2]])
        record['money_day'] = money_spans[2].text

        info_box = soup_detail.find('p', class_='house_basic_title_info')
        record['area'] = info_box.find('span').text
        info_spans = info_box.find_all('span')
        record['type'] = info_spans[1].text
        record['qizuqi'] = info_spans[2].text

        record['address'] = soup_detail.find('h3', class_='general-weizhi-title').text
        record['poster_name'] = soup_detail.find('span', class_='name-text').text

        # The phone number is taken from the raw page text. A page without a
        # "phone" key is treated like an empty number so that the
        # "not disclosed" fallback below applies instead of crashing.
        phone_match = re.search(r'"phone":"(.*?)",', detail_text)
        record['poster_phone'] = phone_match.group(1) if phone_match else ''

        record['intro'] = [
            {li.find('span', class_='title').text: li.find('span', class_='content').text}
            for li in soup_detail.find('ul', class_='general-item-wrap').find_all('li')
        ]
        record['miaoshu'] = [
            {div.find('p').text: div.find('article').text}
            for div in soup_detail.find_all('div', class_='des-item')
        ]
        record['peitao'] = [li.text for li in soup_detail.find_all('li', class_='peitao-on')]
        record['pics'] = [
            parse.unquote(img.attrs['src'])
            for img in soup_detail.find('ul', class_='general-pic-list').find_all('img')
        ]
        record['location'] = soup_detail.find('meta', attrs={'name': 'location'}).attrs['content']
        record['url'] = parse.unquote(detail_url)

        # Clean the actual strings rather than round-tripping repr(dict)
        # through json.loads: the old quote-replacement trick produced
        # invalid JSON whenever a scraped value contained ' or ".
        data_dict_json = _strip_noise(record)

        # Keep the phone number as text by appending a tab (presumably so
        # spreadsheet-style consumers do not treat it as a number — confirm).
        if data_dict_json['poster_phone']:
            data_dict_json['poster_phone'] = data_dict_json['poster_phone'] + '\t'
        else:
            data_dict_json['poster_phone'] = '未公开手机号,点击待租商铺链接调用拨打'

        # Extract the two coordinates: the text after the last '=' split on ','.
        if data_dict_json['location']:
            coords = data_dict_json['location'].split('=')[-1].split(',')
            data_dict_json['location'] = f"{float(coords[0])},{float(coords[1])}"
        else:
            data_dict_json['location'] = None

        # Call the Gaode API helper to add quantified surrounding-area data.
        data_dict_json.update(get_zhoubian(data_dict_json['location']))
        self.logger.info(current_thread().name + str(data_dict_json))

        # Persist the record.
        self.store.run(data_dict_json)