ai客服
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

263 lines
12 KiB

1 month ago
  1. import time
  2. import re
  3. from playwright.sync_api import Page
  4. def scrape_products(page: Page):
  5. """
  6. """
  7. try:
  8. print(">> 正在尝试自动切换到“商品”标签页...")
  9. try:
  10. # 锁定精准类名 Qk7Fc20IPHKGdyq8SdNw
  11. target_tab = page.locator("div.Qk7Fc20IPHKGdyq8SdNw:has-text('商品')").first
  12. if target_tab.count() > 0:
  13. target_tab.hover()
  14. target_tab.click(force=True)
  15. print(" -> 已点击“商品”标签。")
  16. else:
  17. # 兜底:直接按文本点击
  18. page.get_by_text("商品", exact=True).first.click(force=True)
  19. print(" -> 已执行兜底点击。")
  20. # --- 核心优化 1:等待商品列表出现 ---
  21. print(" -> 正在等待商品列表渲染 (最长10秒)...")
  22. start_wait = time.time()
  23. found_list = False
  24. while time.time() - start_wait < 10:
  25. # 寻找包含“库存”字样的 div,这是商品列表加载出来的标志
  26. check_cards = page.locator("div").filter(has_text=re.compile(r"库存\s*\d+")).count()
  27. if check_cards > 0:
  28. found_list = True
  29. break
  30. time.sleep(1)
  31. if not found_list:
  32. print(" ⚠️ 商品列表在 10 秒内未渲染,可能页面加载较慢或标签切换未成功。")
  33. else:
  34. print(f" -> 商品列表已就绪,当前可见商品块: {check_cards}")
  35. except Exception as e:
  36. print(f" ⚠️ 切换标签页失败: {e}")
  37. # 2. 深度滚动抓取逻辑
  38. print(" -> 开始执行深度滚动抓取...")
  39. results = []
  40. seen_names = set()
  41. max_scrolls = 40 # 增加最大滚动次数,支持更多商品
  42. scroll_count = 0
  43. consecutive_no_new = 0
  44. has_found_any = False # 标记是否已经抓到过任何东西
  45. while scroll_count < max_scrolls:
  46. # --- 核心优化 1:使用特征属性定位商品卡片 ---
  47. # 飞鸽页面中,商品卡片通常带有 data-btm-id,且包含 .card_ 字样
  48. # 如果没有,则使用包含“库存”字样的 div 作为基准
  49. card_locators = [
  50. page.locator("div[data-btm-id*='card_']"),
  51. page.locator("div._2a3389bc1da255c500cb6d07f312c984-less"), # 用户提供的特定类名
  52. page.locator("div").filter(has_text=re.compile(r"库存\s*\d+"))
  53. ]
  54. current_cards = []
  55. for loc in card_locators:
  56. all_found = loc.all()
  57. if len(all_found) > 0: # 只要抓到,不限数量
  58. current_cards = all_found
  59. break
  60. new_found_this_round = 0
  61. for card in current_cards:
  62. try:
  63. # 获取卡片的 inner_text,如果太短则尝试溯源到父级
  64. inner_text = card.inner_text()
  65. # 鲁棒性改进:如果卡片本身没抓到“发送”字样,可能是因为定位到了子 div
  66. # 尝试向上寻找直到找到包含“发送”的父级
  67. if "发送" not in inner_text and "库存" in inner_text:
  68. parent = card
  69. for _ in range(3): # 最多向上找 3 层
  70. parent = parent.locator("xpath=..")
  71. p_text = parent.inner_text()
  72. if "发送" in p_text:
  73. inner_text = p_text
  74. break
  75. # 最终检查:必须包含关键特征
  76. if "库存" not in inner_text:
  77. continue
  78. # 对 2,533 这种带逗号的数字进行清理
  79. clean_text = inner_text.replace(",", "")
  80. # 提取名称:取第一行或非空长行
  81. lines = [l.strip() for l in inner_text.split("\n") if len(l.strip()) > 3]
  82. junk_words = ["全部商品", "浏览足迹", "爆品推荐", "订单", "会话搜索", "快捷短语", "保障", "物流", "复制", "发送"]
  83. name = ""
  84. for l in lines:
  85. if not any(j in l for j in junk_words) and "库存" not in l and "" not in l and "已售" not in l:
  86. name = l
  87. break
  88. if not name or name in seen_names:
  89. continue
  90. seen_names.add(name)
  91. new_found_this_round += 1
  92. has_found_any = True
  93. # 提取图片 (支持 background-image 和 img 标签)
  94. img_url = ""
  95. img_div = card.locator("div[style*='background-image']").first
  96. if img_div.count() > 0:
  97. style = img_div.get_attribute("style")
  98. match = re.search(r'url\("?([^"\)]+)"?\)', style)
  99. if match: img_url = match.group(1)
  100. if not img_url:
  101. img_el = card.locator("img").first
  102. if img_el.count() > 0: img_url = img_el.get_attribute("src")
  103. # 解析价格与库存
  104. price = 0
  105. price_match = re.search(r"\s*([\d\.]+)", inner_text)
  106. if price_match: price = float(price_match.group(1))
  107. stock_match = re.search(r"库存\s*([\d,]+)", inner_text)
  108. if stock_match:
  109. stock_str = stock_match.group(1).replace(",", "")
  110. stock = int(stock_str)
  111. status = "现货"
  112. if "预售" in inner_text or "预热" in inner_text: status = "预售"
  113. delivery_time = ""
  114. # 增强发货时间正则:支持“最晚 48小时内发货”、“15天内发货”等
  115. delivery_match = re.search(r"((?:最晚|现在付款,)?(?:明天|后天|[\d]+小时内|[\d]+天内)发货)", inner_text)
  116. if delivery_match: delivery_time = delivery_match.group(1)
  117. item_data = {
  118. "name": name, "img_url": img_url, "stock": stock,
  119. "status": status, "delivery_time": delivery_time, "price": price
  120. }
  121. results.append(item_data)
  122. print(f" [抓取成功] {name[:10]}... | 库存:{stock} | 时间:{delivery_time}")
  123. except:
  124. continue
  125. # --- 核心优化 2:更稳健的退出判定 ---
  126. if new_found_this_round == 0:
  127. # 只有在已经抓到过东西的前提下,才开始累计“到底”计数
  128. if has_found_any:
  129. consecutive_no_new += 1
  130. else:
  131. consecutive_no_new = 0
  132. if consecutive_no_new >= 2: # 连续两次滚动没有新商品,说明终于到底了
  133. print(" -> 连续两轮未发现新商品,判定已到达列表底部。")
  134. break
  135. # 执行滚动:优先尝试滚动到当前视图的最后一个元素,再执行容器滚动
  136. try:
  137. if current_cards:
  138. last_card = current_cards[-1]
  139. last_card.scroll_into_view_if_needed()
  140. time.sleep(0.5)
  141. page.evaluate('''() => {
  142. const selectors = [
  143. '.auxo-tabs-content-active',
  144. '.arco-tabs-content',
  145. '.scrollbar-container',
  146. '.auxo-list-items',
  147. '.auxo-spin-container',
  148. '[class*="tabs-content"]'
  149. ];
  150. let scrolled = false;
  151. for (let sel of selectors) {
  152. const els = document.querySelectorAll(sel);
  153. for (let el of els) {
  154. if (el && el.scrollHeight > el.clientHeight) {
  155. // 1200
  156. el.scrollTop += 1200;
  157. scrolled = true;
  158. }
  159. }
  160. }
  161. if (!scrolled) window.scrollBy(0, 1000);
  162. }''')
  163. # 等待加载:给予更多缓冲时间让新数据渲染
  164. time.sleep(2.5)
  165. except Exception as e:
  166. print(f" ⚠️ 执行滚动动作时出错: {e}")
  167. scroll_count += 1
  168. print(f" -> 滚动中 ({scroll_count}/{max_scrolls})... 当前新发现 {new_found_this_round} 个商品")
  169. print(f">> 深度抓取同步完成,共抓取到 {len(results)} 个唯一商品。")
  170. return results
  171. except Exception as e:
  172. print(f"[Scraper] 抓取发生异常: {e}")
  173. return []
  174. def send_product_link_via_ui(page: Page, keyword: str):
  175. """
  176. UI
  177. """
  178. try:
  179. if not keyword:
  180. return False
  181. print(f">> [UI动作] 尝试为客户发送商品卡片: {keyword}")
  182. # 1. 切换到“商品”标签页
  183. target_tab = page.locator("div.Qk7Fc20IPHKGdyq8SdNw:has-text('商品')").first
  184. if target_tab.count() > 0:
  185. target_tab.click(force=True)
  186. time.sleep(1)
  187. # 2. 在搜索框中输入关键词
  188. search_input = page.locator("input.auxo-input[placeholder*='输入商品名称']").first
  189. if search_input.count() > 0:
  190. search_input.fill("")
  191. search_input.type(keyword, delay=50)
  192. search_input.press("Enter")
  193. print(f" -> 已输入关键词并回车,等待结果加载...")
  194. time.sleep(3) # 给更多时间加载搜索结果
  195. # 3. 寻找并点击第一个“发送商品”按钮
  196. # 核心优化:使用 JS 注入在当前视图中寻找类名为 auxo-btn-dashed 且包含“发送商品”文本的按钮
  197. # 这种方式比常规 Selector 更能抵抗 DOM 结构波动
  198. click_success = page.evaluate('''(keyword) => {
  199. const buttons = Array.from(document.querySelectorAll('button.auxo-btn-dashed'));
  200. const sendBtn = buttons.find(btn => btn.innerText.includes('发送商品'));
  201. if (sendBtn) {
  202. sendBtn.scrollIntoView({ behavior: 'smooth', block: 'center' });
  203. //
  204. return new Promise(resolve => {
  205. setTimeout(() => {
  206. sendBtn.click();
  207. resolve(true);
  208. }, 500);
  209. });
  210. }
  211. return false;
  212. }''', keyword)
  213. if click_success:
  214. print(f" -> [成功] 已通过 JS 触发“发送商品”点击 (关键词: {keyword})。")
  215. time.sleep(2) # 点击后留足发送时间
  216. else:
  217. print(f" -> [失败] 页面上未发现可点击的“发送商品”按钮。")
  218. else:
  219. print(" ⚠️ 未能在页面上找到商品搜索框。")
  220. # 4. 复位:返回订单标签页
  221. time.sleep(1) # 复位前稍等
  222. page.locator("div.Qk7Fc20IPHKGdyq8SdNw:has-text('订单')").first.click(force=True)
  223. return True
  224. except Exception as e:
  225. print(f">> [UI动作] 执行发送商品动作失败: {e}")
  226. return False