processor.py 3.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
import json
import re
from threading import current_thread
from urllib import parse

from bs4 import BeautifulSoup

from gaode_api import get_zhoubian
from log import PPLogger
from setting import STORE_METHOD
from store import WuBaStore
  10. class WuBaProcessor:
  11. def __init__(self):
  12. self.logger = PPLogger(name='58processor')
  13. self.logger.setup_logger()
  14. self.store = WuBaStore(STORE_METHOD)
  15. def processor(self, detail_text, detail_url):
  16. """
  17. 数据处理
  18. :param detail_text: 商铺页面html
  19. :param detail_url: 商铺页面url
  20. :return:
  21. """
  22. soup_detail = BeautifulSoup(detail_text, 'lxml')
  23. data_dict = dict()
  24. # try:
  25. data_dict['title'] = soup_detail.find('div', class_='house-title').find('h1').text
  26. data_dict['tags'] = [span.text for span in
  27. soup_detail.find('div', class_='house-title').find('p').find_all('span')]
  28. data_dict['money_month'] = ''.join(
  29. [span.text for span in soup_detail.find('p', class_='house_basic_title_money').find_all('span')[:2]])
  30. data_dict['money_day'] = soup_detail.find('p', class_='house_basic_title_money').find_all('span')[2].text
  31. data_dict['area'] = soup_detail.find('p', class_='house_basic_title_info').find('span').text
  32. data_dict['type'] = soup_detail.find('p', class_='house_basic_title_info').find_all('span')[1].text
  33. data_dict['qizuqi'] = soup_detail.find('p', class_='house_basic_title_info').find_all('span')[2].text
  34. data_dict['address'] = soup_detail.find('h3', class_='general-weizhi-title').text
  35. data_dict['poster_name'] = soup_detail.find('span', class_='name-text').text
  36. data_dict['poster_phone'] = re.search(r'"phone":"(.*?)",', detail_text).group(1)
  37. data_dict['intro'] = [{li.find('span', class_='title').text: li.find('span', class_='content').text} for li
  38. in soup_detail.find('ul', class_='general-item-wrap').find_all('li')]
  39. data_dict['miaoshu'] = [{div.find('p').text: div.find('article').text} for div in
  40. soup_detail.find_all('div', class_='des-item')]
  41. data_dict['peitao'] = [li.text for li in soup_detail.find_all('li', class_='peitao-on')]
  42. data_dict['pics'] = [parse.unquote(img.attrs['src']) for img in
  43. soup_detail.find('ul', class_='general-pic-list').find_all('img')]
  44. data_dict['location'] = soup_detail.find('meta', attrs={'name': 'location'}).attrs['content']
  45. data_dict['url'] = parse.unquote(detail_url)
  46. data_dict_json = json.loads(
  47. re.sub(r'\\n|\s|\\xa0|询问卖方心理预期?|询问具体转让内容?|位置-', '', str(data_dict)).replace("'", '"'))
  48. # 手机号转文本格式
  49. data_dict_json['poster_phone'] = data_dict_json['poster_phone']+'\t' if data_dict_json['poster_phone'] else '未公开手机号,点击待租商铺链接调用拨打'
  50. # 提取经纬度
  51. temp_list = data_dict_json['location'].split('=')[-1].split(',') if data_dict_json['location'] else None
  52. # data_dict_json['location'] = f"{float(temp_list[0]):.6f},{float(temp_list[0]):.6f}" if temp_list else None
  53. data_dict_json['location'] = f"{float(temp_list[0])},{float(temp_list[1])}" if temp_list else None
  54. # 调用高德API获取量化周边数据
  55. data_dict_json.update(get_zhoubian(data_dict_json['location']))
  56. self.logger.info(current_thread().name+str(data_dict_json))
  57. # 持久化
  58. self.store.run(data_dict_json)
  59. # except Exception as e:
  60. # self.logger.error(detail_url)
  61. # self.logger.error(e)