Web kazıma kelime listeleri

Şu anda, bir kuyruk dosyasında sakladığım URL listesi üzerinde çalışan bir web tarayıcısı (crawler) geliştiriyorum. Örümceğin (Spider), kuyruktaki bir sonraki bağlantıya geçmeden önce bu URL sayfalarındaki tüm sözcükleri kazıması gerekiyor. Web kazıyıcının, bir sözcüğü eklemeden önce onu web.track dosyamla karşılaştırarak orada bulunmadığından ve listede henüz yer almadığından emin olmasına ihtiyacım var, vs.

spider.py içindeki get_keywords ile buna benzer bir şey denedim ama hiçbir şey yapmıyor; bütün gün kod yazdığım için basit bir şeyi gözden kaçırıyor olabilirim.

Spider.py kodum şöyle:

from Gen_info import * 


class Spider:
    """Shared crawl state for one project plus the helpers that update it.

    All state is stored on the class itself rather than on instances, so
    every caller (including the worker threads in main.py) sees the same
    ``queue`` and ``crawled`` sets.
    """

    # Class-level state, filled in by __init__ / boot().
    project_name = ''
    queue_file = ''
    crawled_file = ''
    keyword_file = ''  # declared but never assigned inside this class
    queue = set()
    crawled = set()

    def __init__(self, project_name):
        """Record the project's file locations and load the saved state.

        :param project_name: directory name used as the prefix for the
            queue and crawled files.
        """
        Spider.project_name = project_name
        Spider.queue_file = Spider.project_name + '/Chrome_Hist.csv'
        Spider.crawled_file = Spider.project_name + '/CrawledUrls.txt'
        self.boot()

    @staticmethod
    def boot():
        """Create the project directory/files and load both sets from disk.

        Delegates to create_project_dir, create_files and file_to_set,
        which come from the Gen_info star import.
        """
        create_project_dir(Spider.project_name)
        create_files(Spider.project_name)
        Spider.queue = file_to_set(Spider.queue_file)
        Spider.crawled = file_to_set(Spider.crawled_file)

    @staticmethod
    def crawl_page(thread_name, page_url):
        """Mark ``page_url`` as crawled and persist the updated sets.

        :param thread_name: label printed in the progress line.
        :param page_url: URL to move from the queue to the crawled set.
        """
        if page_url in Spider.crawled:
            # A URL that was crawled earlier but is still queued would
            # otherwise stay in the queue file forever, and any caller that
            # loops until the queue file is empty would never finish.
            if page_url in Spider.queue:
                Spider.queue.discard(page_url)
                Spider.update_files()
            return

        print(thread_name + ' now crawling ' + page_url)
        print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
        # discard() instead of remove(): a URL that is no longer in the
        # in-memory queue must not raise KeyError inside a worker thread.
        Spider.queue.discard(page_url)
        Spider.crawled.add(page_url)
        Spider.update_files()

    @staticmethod
    def update_files():
        """Write the in-memory queue and crawled sets back to their files."""
        set_to_file(Spider.queue, Spider.queue_file)
        set_to_file(Spider.crawled, Spider.crawled_file)

    @staticmethod
    def get_keywords(Page_words):
        """Count the non-common alphanumeric words in ``Page_words``.

        The text is lower-cased and split on whitespace. Words listed (one
        per line) in ``Common_words.txt`` in the current working directory
        are skipped, as are tokens that are not purely alphanumeric.

        :param Page_words: text of a page as a single string.
        :returns: dict mapping each kept word to its number of occurrences.
        """
        # Close the file deterministically and use a set for O(1) lookups.
        with open("Common_words.txt") as common_file:
            common = set(common_file.read().split('\n'))

        word_dict = {}
        for word in Page_words.lower().split():
            if word.isalnum() and word not in common:
                # A single increment per occurrence; the first sighting
                # therefore counts as 1, not 2.
                word_dict[word] = word_dict.get(word, 0) + 1
        return word_dict

main.py

import threading 
from Queue import Queue 
from Spider import Spider 
from Gen_info import * 
import urllib2 
from bs4 import BeautifulSoup 
from shutil import copyfile 
import os 


# Project layout: every data file lives under the PROJECT_NAME directory.
PROJECT_NAME = 'History Forensics'
QUEUE_FILE = PROJECT_NAME + '/Chrome_Hist.csv'
CRAWLED_FILE = PROJECT_NAME + '/CrawledUrls.txt'
NUMBER_OF_THREADS = 2
# NOTE: this rebinds the imported Queue class to a single instance. The name
# is kept because work() and create_jobs() refer to this instance as `Queue`.
Queue = Queue()
# Instantiated for its side effects: the constructor sets the Spider class
# attributes and calls Spider.boot().
Spider(PROJECT_NAME)
keywords = ''



# Raw strings: the backslashes are path separators, not escape sequences.
# The resulting text is unchanged, but the literals no longer depend on
# `\U`, `\L`, `\P` and `\C` being unrecognised escapes.
src = r'C:\Users\Lewis Collins\Python Project\ChromeDBs\Chrome_Hist.csv'
dst = PROJECT_NAME

path = r'C:\Users\Lewis Collins\Python Project\ChromeDBs\Chrome_Hist.csv'

def create_workers():
    """Start NUMBER_OF_THREADS background threads that each run work().

    The threads are flagged as daemons, so they are terminated when the
    main thread exits instead of keeping the process alive.
    """
    for _unused in range(NUMBER_OF_THREADS):
        worker = threading.Thread(target=work)
        worker.daemon = True
        worker.start()


def work():
    """Worker loop: take the next URL off the job queue and crawl it.

    Runs forever; it is meant to be the target of a daemon thread.
    """
    while True:
        url = Queue.get()
        try:
            Spider.crawl_page(threading.current_thread().name, url)
        finally:
            # Always acknowledge the job. If crawl_page raised and
            # task_done() were skipped, Queue.join() would block forever.
            Queue.task_done()


def create_jobs():
    """Enqueue every link from the queue file, wait, then re-check.

    Each link read from QUEUE_FILE becomes one job on the shared queue.
    After all jobs have been acknowledged, crawl() is called again to see
    whether the queue file still holds links.
    """
    pending_links = file_to_set(QUEUE_FILE)
    for pending in pending_links:
        Queue.put(pending)
    # Block until every job put above has been marked done.
    Queue.join()
    crawl()


def crawl():
    """Schedule jobs if the queue file still contains links.

    Does nothing when the queue file yields no links; otherwise prints how
    many are waiting and hands over to create_jobs().
    """
    remaining = file_to_set(QUEUE_FILE)
    if len(remaining) == 0:
        return
    print(str(len(remaining)) + ' links in the queue')
    create_jobs()

def get_keywords():
    """Append new keywords from every queued page to the keywords file.

    For each URL in QUEUE_FILE the page is downloaded, <script> and <style>
    elements are removed, and the remaining text is split on whitespace.
    A word is appended to ``PROJECT_NAME/keywords.txt`` when it is purely
    alphanumeric, is not listed in ``File_Storage/common.txt`` and is not
    already in the keywords file.

    Returns None; the result is the content appended to the keywords file.
    Errors from opening the files or fetching a URL propagate to the caller.
    """
    # Local import keeps this function self-contained. io.open gives
    # unicode-aware file objects on both Python 2 and Python 3.
    import io

    keywords_path = PROJECT_NAME + '/keywords.txt'

    # Strip each line: readlines() would keep the trailing newline, so no
    # scraped word could ever match an entry in the common-word list.
    # NOTE(review): both files are assumed to be UTF-8 -- confirm.
    with io.open('File_Storage/common.txt', 'r', encoding='utf-8') as common_file:
        common_words = set(line.strip() for line in common_file)
    with io.open(keywords_path, 'r', encoding='utf-8') as existing:
        keywords = set(existing.read().split('\n'))

    with io.open(keywords_path, 'a', encoding='utf-8') as out:
        for url in file_to_set(QUEUE_FILE):
            html_content = urllib2.urlopen(url).read()
            soup = BeautifulSoup(html_content)
            for tag in soup(["script", "style"]):
                tag.extract()
            # split() with no argument breaks on any whitespace run, so
            # words next to newlines or tabs are kept instead of being
            # rejected by isalnum() because of the attached whitespace.
            # NOTE(review): words are compared case-sensitively here.
            for word in soup.get_text().split():
                if word.isalnum() and word not in common_words and word not in keywords:
                    out.write(word + u'\n')
                    keywords.add(word)






#copyfile(src, dst) 
# 

# os.remove(path) 
# Script entry point. Order matters: the worker threads are started first so
# they are ready to take jobs, keywords are then scraped from every queued
# URL, and finally crawl() feeds the queued URLs to the workers.
create_workers()
get_keywords()
crawl()

Sorularınızı çekinmeden sorun; başka bir kod görmek isterseniz söyleyin. Şimdiden herkese teşekkürler.

kaynak

2016-03-20 Lewis Collins

yılında

Daha fazla ayrıntı görmek gerekebilir; ancak tüm sorular bir yana, URL'leri tek tek işlemek basit: iş parçacığı (thread) kodunuzu çıkarın ve her şeyi sırayla çalıştırın — tabii gereksinimi yanlış anlamadıysam. Ya da aynı anda yalnızca 1 iş parçacığıyla sınırlamak için bir sayaç (ör. semafor) kullanın –

Web tarayıcısının kendisi düzgün çalışıyor: örneğin kuyruğa 5 URL koyuyorum, örümcek bir URL'yi ziyaret ettikten sonra onu kuyruktan kaldırıp crawled.txt dosyasına ekliyor. Ancak bir sonraki URL'ye geçmeden önce web sayfasını ayrıştırıp o URL'deki sözcükleri almasını bir şekilde sağlamam gerekiyor. –

Aynı durumda takılan olursa diye, soruyu çözümümle birlikte düzenleyeceğim –

def get_keywords():
    """Append new keywords from every queued page to the keywords file.

    For each URL in QUEUE_FILE the page is downloaded, <script> and <style>
    elements are removed, and the remaining text is split on whitespace.
    A word is appended to ``PROJECT_NAME/keywords.txt`` when it is purely
    alphanumeric, is not listed in ``File_Storage/common.txt`` and is not
    already in the keywords file.

    Returns None; the result is the content appended to the keywords file.
    Errors from opening the files or fetching a URL propagate to the caller.
    """
    # Local import keeps this function self-contained. io.open gives
    # unicode-aware file objects on both Python 2 and Python 3.
    import io

    keywords_path = PROJECT_NAME + '/keywords.txt'

    # Strip each line: readlines() would keep the trailing newline, so no
    # scraped word could ever match an entry in the common-word list.
    # NOTE(review): both files are assumed to be UTF-8 -- confirm.
    with io.open('File_Storage/common.txt', 'r', encoding='utf-8') as common_file:
        common_words = set(line.strip() for line in common_file)
    with io.open(keywords_path, 'r', encoding='utf-8') as existing:
        keywords = set(existing.read().split('\n'))

    with io.open(keywords_path, 'a', encoding='utf-8') as out:
        for url in file_to_set(QUEUE_FILE):
            html_content = urllib2.urlopen(url).read()
            soup = BeautifulSoup(html_content)
            for tag in soup(["script", "style"]):
                tag.extract()
            # split() with no argument breaks on any whitespace run, so
            # words next to newlines or tabs are kept instead of being
            # rejected by isalnum() because of the attached whitespace.
            # NOTE(review): words are compared case-sensitively here.
            for word in soup.get_text().split():
                if word.isalnum() and word not in common_words and word not in keywords:
                    out.write(word + u'\n')
                    keywords.add(word)

kaynak

2016-03-27 11:47:20

cevap

İlgili konular