% Conference paper from the WWW 2013 proceedings (researcher homepage classification).
% Cleaned from an automated export: the citation key is kept as-is so existing
% citations keep resolving; the duplicated series field was dropped; names use
% the "Last, First" form; the copyright line was moved out of the abstract.
@inproceedings{5b8cdb55faa94d0fa6dbb2ca1898d208,
  author    = {{Das G}, Sujatha and Caragea, Cornelia and Mitra, Prasenjit and Giles, C. Lee},
  title     = {Researcher homepage classification using unlabeled data},
  booktitle = {{WWW} 2013 -- Proceedings of the 22nd International Conference on World Wide Web},
  pages     = {471--481},
  year      = {2013},
  month     = may,
  isbn      = {9781450320351},
  language  = {English (US)},
  note      = {22nd International Conference on World Wide Web, {WWW} 2013; conference date: 13--17 May 2013},
  copyright = {Copyright is held by the International World Wide Web Conference Committee (IW3C2).},
  abstract  = {A classifier that determines if a webpage is relevant to a specified set of topics comprises a key component for focused crawling. Can a classifier that is tuned to perform well on training datasets continue to filter out irrelevant pages in the face of changed content on the Web? We investigate this question in the context of researcher homepage crawling. We show experimentally that classifiers trained on existing datasets for homepage identification underperform while classifying ``irrelevant'' pages on current-day academic websites. As an alternative to obtaining datasets to retrain the classifier for the new content, we propose to use effectively unlimited amounts of unlabeled data readily available from these websites in a co-training scenario. To this end, we design novel URL-based features and use them in conjunction with content-based features as complementary views of the data to obtain remarkable improvements in accurately identifying homepages from the current-day university websites. In addition, we propose a novel technique for ``learning a conforming pair of classifiers'' using mini-batch gradient descent. Our algorithm seeks to minimize a loss (objective) function quantifying the difference in predictions from the two views afforded by co-training. We demonstrate that tuning the classifiers so that they make ``similar'' predictions on unlabeled data strongly corresponds to the effect achieved by co-training algorithms. We argue that this loss formulation provides insight into understanding the co-training process and can be used even in absence of a validation set.},
}