|
|
@@ -6,6 +6,8 @@ from store import WuBaStore
|
|
|
from setting import STORE_METHOD
|
|
|
from urllib import parse
|
|
|
from gaode_api import get_zhoubian
|
|
|
+from threading import current_thread
|
|
|
+
|
|
|
|
|
|
class WuBaProcessor:
|
|
|
def __init__(self):
|
|
|
@@ -22,41 +24,42 @@ class WuBaProcessor:
|
|
|
"""
|
|
|
soup_detail = BeautifulSoup(detail_text, 'lxml')
|
|
|
data_dict = dict()
|
|
|
- try:
|
|
|
- data_dict['title'] = soup_detail.find('div', class_='house-title').find('h1').text
|
|
|
- data_dict['tags'] = [span.text for span in
|
|
|
- soup_detail.find('div', class_='house-title').find('p').find_all('span')]
|
|
|
- data_dict['money_month'] = ''.join(
|
|
|
- [span.text for span in soup_detail.find('p', class_='house_basic_title_money').find_all('span')[:2]])
|
|
|
- data_dict['money_day'] = soup_detail.find('p', class_='house_basic_title_money').find_all('span')[2].text
|
|
|
- data_dict['area'] = soup_detail.find('p', class_='house_basic_title_info').find('span').text
|
|
|
- data_dict['type'] = soup_detail.find('p', class_='house_basic_title_info').find_all('span')[1].text
|
|
|
- data_dict['qizuqi'] = soup_detail.find('p', class_='house_basic_title_info').find_all('span')[2].text
|
|
|
- data_dict['address'] = soup_detail.find('h3', class_='general-weizhi-title').text
|
|
|
- data_dict['poster_name'] = soup_detail.find('span', class_='name-text').text
|
|
|
- data_dict['poster_phone'] = re.search(r'"phone":"(.*?)",', detail_text).group(1)
|
|
|
- data_dict['intro'] = [{li.find('span', class_='title').text: li.find('span', class_='content').text} for li
|
|
|
- in soup_detail.find('ul', class_='general-item-wrap').find_all('li')]
|
|
|
- data_dict['miaoshu'] = [{div.find('p').text: div.find('article').text} for div in
|
|
|
- soup_detail.find_all('div', class_='des-item')]
|
|
|
- data_dict['peitao'] = [li.text for li in soup_detail.find_all('li', class_='peitao-on')]
|
|
|
- data_dict['pics'] = [parse.unquote(img.attrs['src']) for img in
|
|
|
- soup_detail.find('ul', class_='general-pic-list').find_all('img')]
|
|
|
- data_dict['location'] = soup_detail.find('meta', attrs={'name': 'location'}).attrs['content']
|
|
|
- data_dict['url'] = parse.unquote(detail_url)
|
|
|
+ # try:
|
|
|
+ data_dict['title'] = soup_detail.find('div', class_='house-title').find('h1').text
|
|
|
+ data_dict['tags'] = [span.text for span in
|
|
|
+ soup_detail.find('div', class_='house-title').find('p').find_all('span')]
|
|
|
+ data_dict['money_month'] = ''.join(
|
|
|
+ [span.text for span in soup_detail.find('p', class_='house_basic_title_money').find_all('span')[:2]])
|
|
|
+ data_dict['money_day'] = soup_detail.find('p', class_='house_basic_title_money').find_all('span')[2].text
|
|
|
+ data_dict['area'] = soup_detail.find('p', class_='house_basic_title_info').find('span').text
|
|
|
+ data_dict['type'] = soup_detail.find('p', class_='house_basic_title_info').find_all('span')[1].text
|
|
|
+ data_dict['qizuqi'] = soup_detail.find('p', class_='house_basic_title_info').find_all('span')[2].text
|
|
|
+ data_dict['address'] = soup_detail.find('h3', class_='general-weizhi-title').text
|
|
|
+ data_dict['poster_name'] = soup_detail.find('span', class_='name-text').text
|
|
|
+ data_dict['poster_phone'] = re.search(r'"phone":"(.*?)",', detail_text).group(1)
|
|
|
+ data_dict['intro'] = [{li.find('span', class_='title').text: li.find('span', class_='content').text} for li
|
|
|
+ in soup_detail.find('ul', class_='general-item-wrap').find_all('li')]
|
|
|
+ data_dict['miaoshu'] = [{div.find('p').text: div.find('article').text} for div in
|
|
|
+ soup_detail.find_all('div', class_='des-item')]
|
|
|
+ data_dict['peitao'] = [li.text for li in soup_detail.find_all('li', class_='peitao-on')]
|
|
|
+ data_dict['pics'] = [parse.unquote(img.attrs['src']) for img in
|
|
|
+ soup_detail.find('ul', class_='general-pic-list').find_all('img')]
|
|
|
+ data_dict['location'] = soup_detail.find('meta', attrs={'name': 'location'}).attrs['content']
|
|
|
+ data_dict['url'] = parse.unquote(detail_url)
|
|
|
|
|
|
- data_dict_json = json.loads(
|
|
|
- re.sub(r'\\n|\s|\\xa0|询问卖方心理预期?|询问具体转让内容?|位置-', '', str(data_dict)).replace("'", '"'))
|
|
|
- # 手机号转文本格式
|
|
|
- data_dict_json['poster_phone'] = data_dict_json['poster_phone']+'\t' if data_dict_json['poster_phone'] else '未公开手机号,点击待租商铺链接调用拨打'
|
|
|
- # 提取经纬度
|
|
|
- temp_list = data_dict_json['location'].split('=')[-1].split(',') if data_dict_json['location'] else None
|
|
|
- data_dict_json['location'] = f"{float(temp_list[0]):.6f},{float(temp_list[0]):.6f}" if temp_list else None
|
|
|
- # 调用高德API获取量化周边数据
|
|
|
- data_dict_json.update(get_zhoubian(data_dict_json['location']))
|
|
|
- self.logger.info(data_dict_json)
|
|
|
- # 持久化
|
|
|
- self.store.run(data_dict_json)
|
|
|
- except Exception as e:
|
|
|
- self.logger.error(detail_url)
|
|
|
- self.logger.error(e)
|
|
|
+ data_dict_json = json.loads(
|
|
|
+ re.sub(r'\\n|\s|\\xa0|询问卖方心理预期?|询问具体转让内容?|位置-', '', str(data_dict)).replace("'", '"'))
|
|
|
+ # 手机号转文本格式
|
|
|
+ data_dict_json['poster_phone'] = data_dict_json['poster_phone']+'\t' if data_dict_json['poster_phone'] else '未公开手机号,点击待租商铺链接调用拨打'
|
|
|
+ # 提取经纬度
|
|
|
+ temp_list = data_dict_json['location'].split('=')[-1].split(',') if data_dict_json['location'] else None
|
|
|
+ # data_dict_json['location'] = f"{float(temp_list[0]):.6f},{float(temp_list[0]):.6f}" if temp_list else None
|
|
|
+ data_dict_json['location'] = f"{float(temp_list[0])},{float(temp_list[1])}" if temp_list else None
|
|
|
+ # 调用高德API获取量化周边数据
|
|
|
+ data_dict_json.update(get_zhoubian(data_dict_json['location']))
|
|
|
+ self.logger.info(current_thread().name+str(data_dict_json))
|
|
|
+ # 持久化
|
|
|
+ self.store.run(data_dict_json)
|
|
|
+ # except Exception as e:
|
|
|
+ # self.logger.error(detail_url)
|
|
|
+ # self.logger.error(e)
|