Skip to content

Commit b4e821b

Browse files
authored
Merge branch 'dev' into add_near_duplicate_filter
2 parents a48d064 + 0c9bba7 commit b4e821b

File tree

6 files changed

+196
-14
lines changed

6 files changed

+196
-14
lines changed

common/errorpageclassifier/clf.gob

-3.23 KB
Binary file not shown.

common/pagetypeclassifier/clf.gob

4.49 KB
Binary file not shown.

common/errorpageclassifier/dataset.txt → common/pagetypeclassifier/dataset.txt

+181-2
Original file line number | Diff line number | Diff line change
@@ -108,7 +108,7 @@ Now viewing: Image Gallery. Enjoy a visual tour of our activities.||nonerror
108108
You're on our FAQ page. Get answers to common questions.||nonerror
109109
Welcome to the Blog section. Engage with our thoughts and insights.||nonerror
110110
This is the Discussion Forum. Join in, ask questions, or help others.||nonerror
111-
You're on the Login page. Enter your credentials to access your account.||nonerror
111+
You're on the Login page. Enter your credentials to access your account.||login
112112
Welcome to the Sign-Up page. Join our community today.||nonerror
113113
This is your User Dashboard. Manage your account and settings here.||nonerror
114114
You've reached the Checkout page. Review your order and proceed to payment.||nonerror
@@ -198,4 +198,183 @@ You've successfully added the item to your cart!||nonerror
198198
Success! Your password has been updated||nonerror
199199
Welcome back! You have successfully logged in||nonerror
200200
Great job! Your profile has been updated||nonerror
201-
Your message was sent successfully. We'll get back to you shortly||nonerror
201+
Your message was sent successfully. We'll get back to you shortly||nonerror
202+
Welcome to the Login page. Please sign in to continue.||login
203+
Please enter your username and password on the login page.||login
204+
You have reached the login page. Access your account by logging in.||login
205+
Login required. Please authenticate to access this page.||login
206+
Welcome back! Please log in to your account.||login
207+
Sign in to your account on this login page.||login
208+
Secure Login: Enter your credentials to proceed.||login
209+
This is the login page. Please enter your email and password.||login
210+
Access denied. Please log in to continue.||login
211+
You're on the login page. Forgot your password? Click here to reset.||login
212+
User Login: Please provide your username and password.||login
213+
Login to your account to access exclusive features.||login
214+
Authentication required. Please log in.||login
215+
Welcome back! Sign in to access your dashboard.||login
216+
Please log in to proceed to the checkout page.||login
217+
Member login: Enter your credentials below.||login
218+
Staff login portal. Please enter your login details.||login
219+
Customer login: Sign in to view your orders.||login
220+
Partner login: Please authenticate to access partner resources.||login
221+
Administrator login page. Enter your admin credentials.||login
222+
Please log in to access your profile settings.||login
223+
Login successful. Redirecting to your account dashboard.||login
224+
Incorrect password. Please try again.||login
225+
Session expired. Please log in again.||login
226+
Welcome to the secure login page. Your privacy is important to us.||login
227+
Access restricted. Please log in to view this content.||login
228+
Please log in to access the members-only area.||login
229+
Sign in with your social media account on the login page.||login
230+
New user? Register here or log in if you already have an account.||login
231+
Log in to participate in the forum discussions.||login
232+
Access your account by logging in here.||login
233+
Please log in to access your personalized dashboard.||login
234+
Enter your login details to continue.||login
235+
Login Page: Securely enter your credentials.||login
236+
Welcome to the user login portal.||login
237+
Sign in to manage your account settings.||login
238+
This is the login screen. Please authenticate.||login
239+
Returning user? Please log in.||login
240+
Please log in to view your messages.||login
241+
Log in to access premium content.||login
242+
Authentication page: Enter your username and password.||login
243+
Please enter your login information to proceed.||login
244+
User authentication required. Please log in.||login
245+
Log in now to unlock exclusive features.||login
246+
Sign in to check your account balance.||login
247+
Welcome back! Please enter your login credentials.||login
248+
Member login area: Access restricted content by logging in.||login
249+
Please sign in to continue to your profile.||login
250+
Staff members, please log in to access internal resources.||login
251+
Enter your email and password to log in.||login
252+
Login required to view this page. Please sign in.||login
253+
Access your profile by logging into your account.||login
254+
Please provide your login credentials to access the system.||login
255+
Log in to track your order status.||login
256+
Welcome to the employee login page.||login
257+
Secure area: Please log in to continue.||login
258+
Please log in to update your preferences.||login
259+
Sign in to access your learning materials.||login
260+
Please authenticate to proceed to the next step.||login
261+
Login Page: Your session has expired, please log in again.||login
262+
Welcome back! Enter your credentials to sign in.||login
263+
Client login: Access your project details here.||login
264+
Agent login portal: Please sign in with your ID.||login
265+
Enter your user ID and password to log in.||login
266+
Log in to view your subscription details.||login
267+
Login Page: Forgot your password? Click here to reset it.||login
268+
Access restricted to authorized users only. Please log in.||login
269+
Vendor login: Manage your listings by logging in.||login
270+
Please log in to access your saved items.||login
271+
Log in to participate in our online courses.||login
272+
Sign in to view your appointment schedule.||login
273+
Welcome to the admin login page.||login
274+
Please enter your credentials to log in securely.||login
275+
Log in to view your recent activities.||login
276+
Authentication needed. Please sign in to proceed.||login
277+
Member login: Keep me signed in checkbox available.||login
278+
Log in with your email or username.||login
279+
Access your account dashboard by logging in.||login
280+
Sign in to post comments on articles.||login
281+
Please log in to access your billing information.||login
282+
Log in to access your personalized recommendations.||login
283+
Please sign in to view your shopping cart.||login
284+
Enter your credentials to log in and start shopping.||login
285+
Welcome to the customer login page. Sign in to continue.||login
286+
Authentication required. Please log in with your secure ID.||login
287+
Log in to access exclusive member discounts.||login
288+
Please log in to view and manage your wishlist.||login
289+
Sign in to access your event tickets and details.||login
290+
Faculty login: Please enter your staff ID and password.||login
291+
Log in to access your investment portfolio.||login
292+
Access your medical records by logging in securely.||login
293+
Please sign in to continue to the payment gateway.||login
294+
Login required to access your order history.||login
295+
Welcome back! Log in to resume your session.||login
296+
Please log in to submit your application.||login
297+
Enter your username and password to log in to the portal.||login
298+
Student login: Access your course materials by signing in.||login
299+
Log in to customize your news feed preferences.||login
300+
Please authenticate to access your secure messages.||login
301+
Sign in to sync your data across devices.||login
302+
Log in to join the live webinar.||login
303+
Please log in to access your reservation details.||login
304+
Welcome to the supplier login page.||login
305+
Log in to access your support tickets.||login
306+
Enter your credentials to log in and view analytics.||login
307+
Please sign in to access developer resources.||login
308+
Login required to view confidential documents.||login
309+
Log in to participate in the survey.||login
310+
Please authenticate to access the admin dashboard.||login
311+
Sign in to view your loyalty points balance.||login
312+
Log in to manage your email subscriptions.||login
313+
Please log in to proceed with the enrollment process.||login
314+
Access your download history by logging in.||login
315+
Welcome back! Please log in to renew your membership.||login
316+
Enter your employee ID to log in to the time tracking system.||login
317+
Log in to update your security settings.||login
318+
Please sign in to access your saved searches.||login
319+
Authentication required for accessing project files.||login
320+
Log in to collaborate with your team members.||login
321+
Please enter your PIN and password to log in.||login
322+
Sign in to access your fitness progress dashboard.||login
323+
Log in to check your test results.||login
324+
Please log in to schedule your appointments.||login
325+
Welcome to the volunteer login page.||login
326+
Log in to view your donation history.||login
327+
Please authenticate to access the control panel.||login
328+
Sign in to review and accept your job offer.||login
329+
Log in to access premium tutorials and guides.||login
330+
Please log in to manage your API keys.||login
331+
Please log in with your email address and password.||login
332+
Enter your username and password to access your account.||login
333+
Sign in to your account using your email and password.||login
334+
Welcome back! Please enter your login credentials.||login
335+
Email address: [input field] Password: [input field]||login
336+
Login to your account. Don't have one? Sign up now.||login
337+
Username: [input field] Password: [input field] Remember me?||login
338+
Forgot your password? Click here to reset it.||login
339+
Please enter your email and password to continue.||login
340+
Secure login portal. Enter credentials below.||login
341+
Access your account by logging in below.||login
342+
Remember me on this device.||login
343+
Login required. Please sign in to proceed.||login
344+
Forgot password? Reset it here.||login
345+
Sign in with your email address and password.||login
346+
Welcome! Please log in to your account.||login
347+
User login: Enter your email and password.||login
348+
Email: [input field] Password: [input field]||login
349+
Please authenticate by entering your login details.||login
350+
Sign in to your account or register for a new one.||login
351+
Login page: Access restricted to authorized users only.||login
352+
Need help logging in? Click here.||login
353+
Enter your credentials to log in.||login
354+
Keep me signed in.||login
355+
Please sign in to access exclusive content.||login
356+
Welcome back! Sign in to your dashboard.||login
357+
Forgot your username or password? Retrieve them here.||login
358+
Log in using your email or username.||login
359+
Authentication required. Please log in.||login
360+
Password recovery: Reset your password now.||login
361+
Log in to manage your account settings.||login
362+
Sign in to continue to checkout.||login
363+
Enter your login information below.||login
364+
Sign in to access your personalized dashboard.||login
365+
Welcome to the member login page.||login
366+
Already have an account? Log in here.||login
367+
Enter email and password to sign in.||login
368+
Sign in to your profile.||login
369+
Member login: Access your account here.||login
370+
Please log in to continue.||login
371+
Enter your password to log in.||login
372+
Sign in to view your messages.||login
373+
Login to your profile to see updates.||login
374+
Log in to your account to access features.||login
375+
Please provide your username and password.||login
376+
Log in to manage your subscriptions.||login
377+
Sign in using your credentials.||login
378+
Access denied. Please log in first.||login
379+
Authentication portal. Enter login details.||login
380+
Need an account? Sign up or log in if you already have one.||login

common/errorpageclassifier/errorpageclassifier.go → common/pagetypeclassifier/pagetypeclassifier.go

+5-5
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
package errorpageclassifier
1+
package pagetypeclassifier
22

33
import (
44
_ "embed"
@@ -10,19 +10,19 @@ import (
1010
//go:embed clf.gob
1111
var classifierData []byte
1212

13-
type ErrorPageClassifier struct {
13+
type PageTypeClassifier struct {
1414
classifier *naive_bayes.NaiveBayesClassifier
1515
}
1616

17-
func New() *ErrorPageClassifier {
17+
func New() *PageTypeClassifier {
1818
classifier, err := naive_bayes.NewClassifierFromFileData(classifierData)
1919
if err != nil {
2020
panic(err)
2121
}
22-
return &ErrorPageClassifier{classifier: classifier}
22+
return &PageTypeClassifier{classifier: classifier}
2323
}
2424

25-
func (n *ErrorPageClassifier) Classify(html string) string {
25+
func (n *PageTypeClassifier) Classify(html string) string {
2626
text := htmlToText(html)
2727
if text == "" {
2828
return "other"

common/errorpageclassifier/errorpageclassifier_test.go → common/pagetypeclassifier/pagetypeclassifier_test.go

+4-3
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,14 @@
1-
package errorpageclassifier
1+
package pagetypeclassifier
22

33
import (
44
"testing"
55

66
"github.com/stretchr/testify/assert"
77
)
88

9-
func TestErrorPageClassifier(t *testing.T) {
10-
t.Run("test creation of new ErrorPageClassifier", func(t *testing.T) {
9+
func TestPageTypeClassifier(t *testing.T) {
10+
11+
t.Run("test creation of new PageTypeClassifier", func(t *testing.T) {
1112
epc := New()
1213
assert.NotNil(t, epc)
1314
})

runner/runner.go

+6-4
Original file line number | Diff line number | Diff line change
@@ -33,8 +33,8 @@ import (
3333
asnmap "github.com/projectdiscovery/asnmap/libs"
3434
"github.com/projectdiscovery/fastdialer/fastdialer"
3535
"github.com/projectdiscovery/httpx/common/customextract"
36-
"github.com/projectdiscovery/httpx/common/errorpageclassifier"
3736
"github.com/projectdiscovery/httpx/common/hashes/jarm"
37+
"github.com/projectdiscovery/httpx/common/pagetypeclassifier"
3838
"github.com/projectdiscovery/httpx/static"
3939
"github.com/projectdiscovery/mapcidr/asn"
4040
"github.com/projectdiscovery/networkpolicy"
@@ -86,9 +86,10 @@ type Runner struct {
8686
ratelimiter ratelimit.Limiter
8787
HostErrorsCache gcache.Cache[string, int]
8888
browser *Browser
89-
errorPageClassifier *errorpageclassifier.ErrorPageClassifier
89+
errorPageClassifier *errorpageclassifier.ErrorPageClassifier // Use this for the most specific classification of error pages
90+
pageTypeClassifier *pagetypeclassifier.PageTypeClassifier // Include this for general page classification
9091
pHashClusters []pHashCluster
91-
simHashes gcache.Cache[uint64, struct{}]
92+
simHashes gcache.Cache[uint64, struct{}] // Include simHashes for efficient duplicate detection
9293
httpApiEndpoint *Server
9394
}
9495

@@ -363,6 +364,7 @@ func New(options *Options) (*Runner, error) {
363364

364365
runner.errorPageClassifier = errorpageclassifier.New()
365366
runner.simHashes = gcache.New[uint64, struct{}](1000).ARC().Build()
367+
runner.pageTypeClassifier = pagetypeclassifier.New()
366368

367369
if options.HttpApiEndpoint != "" {
368370
apiServer := NewServer(options.HttpApiEndpoint, options)
@@ -2273,7 +2275,7 @@ retry:
22732275
ScreenshotBytes: screenshotBytes,
22742276
HeadlessBody: headlessBody,
22752277
KnowledgeBase: map[string]interface{}{
2276-
"PageType": r.errorPageClassifier.Classify(respData),
2278+
"PageType": r.pageTypeClassifier.Classify(respData),
22772279
"pHash": pHash,
22782280
},
22792281
TechnologyDetails: technologyDetails,

0 commit comments

Comments
 (0)