Hemen üzerideki yorum için teşekkürler @SpaceDog; ben de benzer bir sorunla karşılaştım: NTLM kimlik doğrulaması kullanan bir intranet sitesini taramaya çalışıyordum. CrawlSpider içindeki LinkExtractor tetiklenmediği için tarayıcı yalnızca ilk sayfayı görüyordu.
İşte Scrapy 1.0.5 kullanarak çalışan çözüm:
NTLM_Middleware.py
from scrapy.http import Response, HtmlResponse
import requests
from requests_ntlm import HttpNtlmAuth
class NTLM_Middleware(object):
    """Scrapy downloader middleware that performs NTLM-authenticated fetches.

    Scrapy's built-in downloader cannot negotiate NTLM, so each request is
    re-issued with the ``requests`` library using ``HttpNtlmAuth`` and the
    result is wrapped back into an ``HtmlResponse`` for the spider.
    """

    def process_request(self, request, spider):
        """Fetch ``request.url`` via requests+NTLM and short-circuit Scrapy's
        own download by returning a ready-made ``HtmlResponse``.

        Credentials are read from the spider's ``http_usr``/``http_pass``
        attributes (empty strings if absent).
        """
        url = request.url
        usr = getattr(spider, 'http_usr', '')
        pwd = getattr(spider, 'http_pass', '')
        session = requests.session()
        resp = session.get(url, auth=HttpNtlmAuth(usr, pwd))
        # dict(resp.headers) replaces the Python-2-only .iteritems() call,
        # which raises AttributeError on Python 3. Passing the detected
        # encoding lets Scrapy decode the body correctly for text/xpath.
        return HtmlResponse(
            url=url,
            status=resp.status_code,
            headers=dict(resp.headers),
            body=resp.content,
            encoding=resp.encoding or 'utf-8',
        )
settings.py
import logging

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'scrapy intranet'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 16

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'intranet.NTLM_Middleware.NTLM_Middleware': 200,
    # Disabled: the NTLM middleware already returns decoded bodies.
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': None,
}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# NOTE: ITEM_PIPELINES must be a dict mapping pipeline path -> order (0-1000).
# The original `{'path',}` was a *set* literal, which Scrapy rejects.
ITEM_PIPELINES = {
    'scrapyelasticsearch.scrapyelasticsearch.ElasticSearchPipeline': 100,
}

# scrapy-elasticsearch pipeline configuration.
ELASTICSEARCH_SERVER = 'localhost'
ELASTICSEARCH_PORT = 9200
ELASTICSEARCH_USERNAME = ''
ELASTICSEARCH_PASSWORD = ''
ELASTICSEARCH_INDEX = 'intranet'
ELASTICSEARCH_TYPE = 'pages_intranet'
ELASTICSEARCH_UNIQ_KEY = 'url'
ELASTICSEARCH_LOG_LEVEL = logging.DEBUG
spiders/
intranetspider.py
# -*- coding: utf-8 -*-
import scrapy
#from scrapy import log
from scrapy.spiders import CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.http import Response
import requests
import sys
from bs4 import BeautifulSoup
class PageItem(scrapy.Item):
    """One crawled intranet page, as stored in Elasticsearch."""
    url = scrapy.Field()    # page URL (configured as the Elasticsearch unique key)
    title = scrapy.Field()  # page title
    body = scrapy.Field()   # extracted page text
class IntranetspiderSpider(CrawlSpider):
    """Crawl an NTLM-protected intranet and emit a PageItem per page.

    ``http_usr``/``http_pass`` are read by NTLM_Middleware to authenticate
    each request.
    """
    name = "intranetspider"
    protocol = 'https://'
    http_usr = 'DOMAIN\\user'
    http_pass = 'pass'
    allowed_domains = ['intranet.mydomain.ca']
    start_urls = ['https://intranet.mydomain.ca/']
    # Follow every link on every page and hand each response to parse_items.
    rules = (Rule(LinkExtractor(), callback="parse_items", follow=True),)

    def parse_items(self, response):
        """Extract url, title and visible text from a crawled page."""
        self.logger.info('Crawl de la page %s', response.url)
        item = PageItem()
        # Explicit parser avoids bs4's "no parser was explicitly specified"
        # warning and keeps extraction consistent across environments.
        soup = BeautifulSoup(response.body, 'html.parser')
        # Remove <script> tags so inline JavaScript doesn't pollute the text.
        for script in soup.findAll('script'):
            script.extract()
        item['url'] = response.url
        # The 'title' field was declared on PageItem but never populated.
        item['title'] = soup.title.get_text(strip=True) if soup.title else ''
        item['body'] = soup.get_text(" ", strip=True)
        return item
Sizde çalışan çözümü paylaştığınız için teşekkürler; kimlik doğrulama bu şekilde mükemmel çalışıyor. Örümcekle çalışması için yalnızca 'Response' yerine 'HtmlResponse' döndürmem gerekti; aksi takdirde xpath çalışmıyor. – SpaceDog