Skip to content

Commit

Permalink
introduce page type classifier (#1924)
Browse files Browse the repository at this point in the history
* introduce page type classifier

* add more data
  • Loading branch information
dogancanbakir authored Oct 20, 2024
1 parent 7cfd714 commit 0c9bba7
Show file tree
Hide file tree
Showing 6 changed files with 206 additions and 26 deletions.
Binary file removed common/errorpageclassifier/clf.gob
Binary file not shown.
Binary file added common/pagetypeclassifier/clf.gob
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ Now viewing: Image Gallery. Enjoy a visual tour of our activities.||nonerror
You're on our FAQ page. Get answers to common questions.||nonerror
Welcome to the Blog section. Engage with our thoughts and insights.||nonerror
This is the Discussion Forum. Join in, ask questions, or help others.||nonerror
You're on the Login page. Enter your credentials to access your account.||nonerror
You're on the Login page. Enter your credentials to access your account.||login
Welcome to the Sign-Up page. Join our community today.||nonerror
This is your User Dashboard. Manage your account and settings here.||nonerror
You've reached the Checkout page. Review your order and proceed to payment.||nonerror
Expand Down Expand Up @@ -198,4 +198,183 @@ You've successfully added the item to your cart!||nonerror
Success! Your password has been updated||nonerror
Welcome back! You have successfully logged in||nonerror
Great job! Your profile has been updated||nonerror
Your message was sent successfully. We'll get back to you shortly||nonerror
Your message was sent successfully. We'll get back to you shortly||nonerror
Welcome to the Login page. Please sign in to continue.||login
Please enter your username and password on the login page.||login
You have reached the login page. Access your account by logging in.||login
Login required. Please authenticate to access this page.||login
Welcome back! Please log in to your account.||login
Sign in to your account on this login page.||login
Secure Login: Enter your credentials to proceed.||login
This is the login page. Please enter your email and password.||login
Access denied. Please log in to continue.||login
You're on the login page. Forgot your password? Click here to reset.||login
User Login: Please provide your username and password.||login
Login to your account to access exclusive features.||login
Authentication required. Please log in.||login
Welcome back! Sign in to access your dashboard.||login
Please log in to proceed to the checkout page.||login
Member login: Enter your credentials below.||login
Staff login portal. Please enter your login details.||login
Customer login: Sign in to view your orders.||login
Partner login: Please authenticate to access partner resources.||login
Administrator login page. Enter your admin credentials.||login
Please log in to access your profile settings.||login
Login successful. Redirecting to your account dashboard.||login
Incorrect password. Please try again.||login
Session expired. Please log in again.||login
Welcome to the secure login page. Your privacy is important to us.||login
Access restricted. Please log in to view this content.||login
Please log in to access the members-only area.||login
Sign in with your social media account on the login page.||login
New user? Register here or log in if you already have an account.||login
Log in to participate in the forum discussions.||login
Access your account by logging in here.||login
Please log in to access your personalized dashboard.||login
Enter your login details to continue.||login
Login Page: Securely enter your credentials.||login
Welcome to the user login portal.||login
Sign in to manage your account settings.||login
This is the login screen. Please authenticate.||login
Returning user? Please log in.||login
Please log in to view your messages.||login
Log in to access premium content.||login
Authentication page: Enter your username and password.||login
Please enter your login information to proceed.||login
User authentication required. Please log in.||login
Log in now to unlock exclusive features.||login
Sign in to check your account balance.||login
Welcome back! Please enter your login credentials.||login
Member login area: Access restricted content by logging in.||login
Please sign in to continue to your profile.||login
Staff members, please log in to access internal resources.||login
Enter your email and password to log in.||login
Login required to view this page. Please sign in.||login
Access your profile by logging into your account.||login
Please provide your login credentials to access the system.||login
Log in to track your order status.||login
Welcome to the employee login page.||login
Secure area: Please log in to continue.||login
Please log in to update your preferences.||login
Sign in to access your learning materials.||login
Please authenticate to proceed to the next step.||login
Login Page: Your session has expired, please log in again.||login
Welcome back! Enter your credentials to sign in.||login
Client login: Access your project details here.||login
Agent login portal: Please sign in with your ID.||login
Enter your user ID and password to log in.||login
Log in to view your subscription details.||login
Login Page: Forgot your password? Click here to reset it.||login
Access restricted to authorized users only. Please log in.||login
Vendor login: Manage your listings by logging in.||login
Please log in to access your saved items.||login
Log in to participate in our online courses.||login
Sign in to view your appointment schedule.||login
Welcome to the admin login page.||login
Please enter your credentials to log in securely.||login
Log in to view your recent activities.||login
Authentication needed. Please sign in to proceed.||login
Member login: Keep me signed in checkbox available.||login
Log in with your email or username.||login
Access your account dashboard by logging in.||login
Sign in to post comments on articles.||login
Please log in to access your billing information.||login
Log in to access your personalized recommendations.||login
Please sign in to view your shopping cart.||login
Enter your credentials to log in and start shopping.||login
Welcome to the customer login page. Sign in to continue.||login
Authentication required. Please log in with your secure ID.||login
Log in to access exclusive member discounts.||login
Please log in to view and manage your wishlist.||login
Sign in to access your event tickets and details.||login
Faculty login: Please enter your staff ID and password.||login
Log in to access your investment portfolio.||login
Access your medical records by logging in securely.||login
Please sign in to continue to the payment gateway.||login
Login required to access your order history.||login
Welcome back! Log in to resume your session.||login
Please log in to submit your application.||login
Enter your username and password to log in to the portal.||login
Student login: Access your course materials by signing in.||login
Log in to customize your news feed preferences.||login
Please authenticate to access your secure messages.||login
Sign in to sync your data across devices.||login
Log in to join the live webinar.||login
Please log in to access your reservation details.||login
Welcome to the supplier login page.||login
Log in to access your support tickets.||login
Enter your credentials to log in and view analytics.||login
Please sign in to access developer resources.||login
Login required to view confidential documents.||login
Log in to participate in the survey.||login
Please authenticate to access the admin dashboard.||login
Sign in to view your loyalty points balance.||login
Log in to manage your email subscriptions.||login
Please log in to proceed with the enrollment process.||login
Access your download history by logging in.||login
Welcome back! Please log in to renew your membership.||login
Enter your employee ID to log in to the time tracking system.||login
Log in to update your security settings.||login
Please sign in to access your saved searches.||login
Authentication required for accessing project files.||login
Log in to collaborate with your team members.||login
Please enter your PIN and password to log in.||login
Sign in to access your fitness progress dashboard.||login
Log in to check your test results.||login
Please log in to schedule your appointments.||login
Welcome to the volunteer login page.||login
Log in to view your donation history.||login
Please authenticate to access the control panel.||login
Sign in to review and accept your job offer.||login
Log in to access premium tutorials and guides.||login
Please log in to manage your API keys.||login
Please log in with your email address and password.||login
Enter your username and password to access your account.||login
Sign in to your account using your email and password.||login
Welcome back! Please enter your login credentials.||login
Email address: [input field] Password: [input field]||login
Login to your account. Don't have one? Sign up now.||login
Username: [input field] Password: [input field] Remember me?||login
Forgot your password? Click here to reset it.||login
Please enter your email and password to continue.||login
Secure login portal. Enter credentials below.||login
Access your account by logging in below.||login
Remember me on this device.||login
Login required. Please sign in to proceed.||login
Forgot password? Reset it here.||login
Sign in with your email address and password.||login
Welcome! Please log in to your account.||login
User login: Enter your email and password.||login
Email: [input field] Password: [input field]||login
Please authenticate by entering your login details.||login
Sign in to your account or register for a new one.||login
Login page: Access restricted to authorized users only.||login
Need help logging in? Click here.||login
Enter your credentials to log in.||login
Keep me signed in.||login
Please sign in to access exclusive content.||login
Welcome back! Sign in to your dashboard.||login
Forgot your username or password? Retrieve them here.||login
Log in using your email or username.||login
Authentication required. Please log in.||login
Password recovery: Reset your password now.||login
Log in to manage your account settings.||login
Sign in to continue to checkout.||login
Enter your login information below.||login
Sign in to access your personalized dashboard.||login
Welcome to the member login page.||login
Already have an account? Log in here.||login
Enter email and password to sign in.||login
Sign in to your profile.||login
Member login: Access your account here.||login
Please log in to continue.||login
Enter your password to log in.||login
Sign in to view your messages.||login
Login to your profile to see updates.||login
Log in to your account to access features.||login
Please provide your username and password.||login
Log in to manage your subscriptions.||login
Sign in using your credentials.||login
Access denied. Please log in first.||login
Authentication portal. Enter login details.||login
Need an account? Sign up or log in if you already have one.||login
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package errorpageclassifier
package pagetypeclassifier

import (
_ "embed"
Expand All @@ -10,19 +10,19 @@ import (
//go:embed clf.gob
var classifierData []byte

type ErrorPageClassifier struct {
type PageTypeClassifier struct {
classifier *naive_bayes.NaiveBayesClassifier
}

func New() *ErrorPageClassifier {
func New() *PageTypeClassifier {
classifier, err := naive_bayes.NewClassifierFromFileData(classifierData)
if err != nil {
panic(err)
}
return &ErrorPageClassifier{classifier: classifier}
return &PageTypeClassifier{classifier: classifier}
}

func (n *ErrorPageClassifier) Classify(html string) string {
func (n *PageTypeClassifier) Classify(html string) string {
text := htmlToText(html)
if text == "" {
return "other"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
package errorpageclassifier
package pagetypeclassifier

import (
"testing"

"github.com/stretchr/testify/assert"
)

func TestErrorPageClassifier(t *testing.T) {
t.Run("test creation of new ErrorPageClassifier", func(t *testing.T) {
func TestPageTypeClassifier(t *testing.T) {

t.Run("test creation of new PageTypeClassifier", func(t *testing.T) {
epc := New()
assert.NotNil(t, epc)
})
Expand Down
32 changes: 16 additions & 16 deletions runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ import (
asnmap "github.com/projectdiscovery/asnmap/libs"
"github.com/projectdiscovery/fastdialer/fastdialer"
"github.com/projectdiscovery/httpx/common/customextract"
"github.com/projectdiscovery/httpx/common/errorpageclassifier"
"github.com/projectdiscovery/httpx/common/hashes/jarm"
"github.com/projectdiscovery/httpx/common/pagetypeclassifier"
"github.com/projectdiscovery/httpx/static"
"github.com/projectdiscovery/mapcidr/asn"
"github.com/projectdiscovery/networkpolicy"
Expand Down Expand Up @@ -74,19 +74,19 @@ import (

// Runner is a client for running the enumeration process.
type Runner struct {
options *Options
hp *httpx.HTTPX
wappalyzer *wappalyzer.Wappalyze
scanopts ScanOptions
hm *hybrid.HybridMap
excludeCdn bool
stats clistats.StatisticsClient
ratelimiter ratelimit.Limiter
HostErrorsCache gcache.Cache[string, int]
browser *Browser
errorPageClassifier *errorpageclassifier.ErrorPageClassifier
pHashClusters []pHashCluster
httpApiEndpoint *Server
options *Options
hp *httpx.HTTPX
wappalyzer *wappalyzer.Wappalyze
scanopts ScanOptions
hm *hybrid.HybridMap
excludeCdn bool
stats clistats.StatisticsClient
ratelimiter ratelimit.Limiter
HostErrorsCache gcache.Cache[string, int]
browser *Browser
pageTypeClassifier *pagetypeclassifier.PageTypeClassifier
pHashClusters []pHashCluster
httpApiEndpoint *Server
}

func (r *Runner) HTTPX() *httpx.HTTPX {
Expand Down Expand Up @@ -358,7 +358,7 @@ func New(options *Options) (*Runner, error) {
runner.HostErrorsCache = gc
}

runner.errorPageClassifier = errorpageclassifier.New()
runner.pageTypeClassifier = pagetypeclassifier.New()

if options.HttpApiEndpoint != "" {
apiServer := NewServer(options.HttpApiEndpoint, options)
Expand Down Expand Up @@ -2243,7 +2243,7 @@ retry:
ScreenshotBytes: screenshotBytes,
HeadlessBody: headlessBody,
KnowledgeBase: map[string]interface{}{
"PageType": r.errorPageClassifier.Classify(respData),
"PageType": r.pageTypeClassifier.Classify(respData),
"pHash": pHash,
},
TechnologyDetails: technologyDetails,
Expand Down

0 comments on commit 0c9bba7

Please sign in to comment.