xref: /haiku/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp (revision 3761b2f4207ccac01d028608b724f807ca142427)
1 /*
2  * Copyright 2006-2010, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Andrew Galante, haiku.galante@gmail.com
7  *		Axel Dörfler, axeld@pinc-software.de
8  *		Hugo Santos, hugosantos@gmail.com
9  */
10 
11 
12 #include "TCPEndpoint.h"
13 
14 #include <netinet/in.h>
15 #include <netinet/ip.h>
16 #include <netinet/tcp.h>
17 #include <new>
18 #include <signal.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <stdint.h>
22 
23 #include <KernelExport.h>
24 #include <Select.h>
25 
26 #include <net_buffer.h>
27 #include <net_datalink.h>
28 #include <net_stat.h>
29 #include <NetBufferUtilities.h>
30 #include <NetUtilities.h>
31 
32 #include <lock.h>
33 #include <tracing.h>
34 #include <util/AutoLock.h>
35 #include <util/list.h>
36 
37 #include "EndpointManager.h"
38 
39 
40 // References:
41 //	- RFC 793 - Transmission Control Protocol
42 //	- RFC 813 - Window and Acknowledgement Strategy in TCP
43 //	- RFC 1337 - TIME_WAIT Assassination Hazards in TCP
44 //
45 // Things incomplete in this implementation:
46 //	- TCP Extensions for High Performance, RFC 1323 - RTTM, PAWS
47 //	- Congestion Control, RFC 5681
48 //	- Limited Transmit, RFC 3042
49 //	- SACK, Selective Acknowledgment; RFC 2018, RFC 2883, RFC 6675
50 //	- NewReno Modification to TCP's Fast Recovery, RFC 2582
51 //
52 // Things this implementation currently doesn't implement:
53 //	- Explicit Congestion Notification (ECN), RFC 3168
54 //	- SYN-Cache
55 //	- Forward RTO-Recovery, RFC 4138
56 //	- Time-Wait hash instead of keeping sockets alive
57 
// Formats \a addr as a string via AddressString for the domain returned by
// Domain(); can therefore only be used where Domain() is in scope.
#define PrintAddress(addr) \
	AddressString(Domain(), addr, true).Data()
60 
61 //#define TRACE_TCP
62 //#define PROBE_TCP
63 
#ifndef TRACE_TCP
	// debug output disabled: TRACE() expands to an empty statement
#	define TRACE(arguments...)		do { } while (0)
#else
	// Prints the message via dprintf(), prefixed with the current thread,
	// the system time, the endpoint pointer, and the name of its state.
	// The space before ', ##arguments' is important in order for this to work
	// with cpp 2.95.
#	define TRACE(message, arguments...)	dprintf("%" B_PRId32 ": TCP [%" \
		B_PRIdBIGTIME "] %p (%12s) " message "\n", find_thread(NULL), \
		system_time(), this, name_for_state(fState) , ##arguments)
#endif
72 
#ifndef PROBE_TCP
	// probing disabled: PROBE() expands to an empty statement
#	define PROBE(segmentBuffer, usableWindow)	do { } while (0)
#else
	// Dumps one line of send-side state (sequence numbers, congestion window,
	// slow start threshold, windows, queue usage, retransmit timeout) for the
	// given buffer via dprintf().
#	define PROBE(segmentBuffer, usableWindow) \
	dprintf("TCP PROBE %" B_PRIdBIGTIME " %s %s %" B_PRIu32 " snxt %" B_PRIu32 \
		" suna %" B_PRIu32 " cw %" B_PRIu32 " sst %" B_PRIu32 " win %" \
		B_PRIu32 " swin %" B_PRIu32 " smax-suna %" B_PRIu32 " savail %" \
		B_PRIuSIZE " sqused %" B_PRIuSIZE " rto %" B_PRIdBIGTIME "\n", \
		system_time(), PrintAddress(segmentBuffer->source), \
		PrintAddress(segmentBuffer->destination), segmentBuffer->size, \
		fSendNext.Number(), fSendUnacknowledged.Number(), fCongestionWindow, \
		fSlowStartThreshold, usableWindow, fSendWindow, \
		(fSendMax - fSendUnacknowledged).Number(), \
		fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout)
#endif
87 
88 #if TCP_TRACING
89 namespace TCPTracing {
90 
91 class Receive : public AbstractTraceEntry {
92 public:
93 	Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window,
94 			net_buffer* buffer)
95 		:
96 		fEndpoint(endpoint),
97 		fBuffer(buffer),
98 		fBufferSize(buffer->size),
99 		fSequence(segment.sequence),
100 		fAcknowledge(segment.acknowledge),
101 		fWindow(window),
102 		fState(endpoint->State()),
103 		fFlags(segment.flags)
104 	{
105 		Initialized();
106 	}
107 
108 	virtual void AddDump(TraceOutput& out)
109 	{
110 		out.Print("tcp:%p (%12s) receive buffer %p (%" B_PRIu32 " bytes), "
111 			"flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
112 			", wnd %" B_PRIu32, fEndpoint, name_for_state(fState), fBuffer,
113 			fBufferSize, fFlags, fSequence, fAcknowledge, fWindow);
114 	}
115 
116 protected:
117 	TCPEndpoint*	fEndpoint;
118 	net_buffer*		fBuffer;
119 	uint32			fBufferSize;
120 	uint32			fSequence;
121 	uint32			fAcknowledge;
122 	uint32			fWindow;
123 	tcp_state		fState;
124 	uint8			fFlags;
125 };
126 
127 class Send : public AbstractTraceEntry {
128 public:
129 	Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer,
130 			tcp_sequence firstSequence, tcp_sequence lastSequence)
131 		:
132 		fEndpoint(endpoint),
133 		fBuffer(buffer),
134 		fBufferSize(buffer->size),
135 		fSequence(segment.sequence),
136 		fAcknowledge(segment.acknowledge),
137 		fFirstSequence(firstSequence.Number()),
138 		fLastSequence(lastSequence.Number()),
139 		fState(endpoint->State()),
140 		fFlags(segment.flags)
141 	{
142 		Initialized();
143 	}
144 
145 	virtual void AddDump(TraceOutput& out)
146 	{
147 		out.Print("tcp:%p (%12s) send buffer %p (%" B_PRIu32 " bytes), "
148 			"flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
149 			", first %" B_PRIu32 ", last %" B_PRIu32, fEndpoint,
150 			name_for_state(fState), fBuffer, fBufferSize, fFlags, fSequence,
151 			fAcknowledge, fFirstSequence, fLastSequence);
152 	}
153 
154 protected:
155 	TCPEndpoint*	fEndpoint;
156 	net_buffer*		fBuffer;
157 	uint32			fBufferSize;
158 	uint32			fSequence;
159 	uint32			fAcknowledge;
160 	uint32			fFirstSequence;
161 	uint32			fLastSequence;
162 	tcp_state		fState;
163 	uint8			fFlags;
164 };
165 
166 class State : public AbstractTraceEntry {
167 public:
168 	State(TCPEndpoint* endpoint)
169 		:
170 		fEndpoint(endpoint),
171 		fState(endpoint->State())
172 	{
173 		Initialized();
174 	}
175 
176 	virtual void AddDump(TraceOutput& out)
177 	{
178 		out.Print("tcp:%p (%12s) state change", fEndpoint,
179 			name_for_state(fState));
180 	}
181 
182 protected:
183 	TCPEndpoint*	fEndpoint;
184 	tcp_state		fState;
185 };
186 
187 class Spawn : public AbstractTraceEntry {
188 public:
189 	Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint)
190 		:
191 		fListeningEndpoint(listeningEndpoint),
192 		fSpawnedEndpoint(spawnedEndpoint)
193 	{
194 		Initialized();
195 	}
196 
197 	virtual void AddDump(TraceOutput& out)
198 	{
199 		out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint);
200 	}
201 
202 protected:
203 	TCPEndpoint*	fListeningEndpoint;
204 	TCPEndpoint*	fSpawnedEndpoint;
205 };
206 
207 class Error : public AbstractTraceEntry {
208 public:
209 	Error(TCPEndpoint* endpoint, const char* error, int32 line)
210 		:
211 		fEndpoint(endpoint),
212 		fLine(line),
213 		fError(error),
214 		fState(endpoint->State())
215 	{
216 		Initialized();
217 	}
218 
219 	virtual void AddDump(TraceOutput& out)
220 	{
221 		out.Print("tcp:%p (%12s) error at line %" B_PRId32 ": %s", fEndpoint,
222 			name_for_state(fState), fLine, fError);
223 	}
224 
225 protected:
226 	TCPEndpoint*	fEndpoint;
227 	int32			fLine;
228 	const char*		fError;
229 	tcp_state		fState;
230 };
231 
232 class TimerSet : public AbstractTraceEntry {
233 public:
234 	TimerSet(TCPEndpoint* endpoint, const char* which, bigtime_t timeout)
235 		:
236 		fEndpoint(endpoint),
237 		fWhich(which),
238 		fTimeout(timeout),
239 		fState(endpoint->State())
240 	{
241 		Initialized();
242 	}
243 
244 	virtual void AddDump(TraceOutput& out)
245 	{
246 		out.Print("tcp:%p (%12s) %s timer set to %" B_PRIdBIGTIME, fEndpoint,
247 			name_for_state(fState), fWhich, fTimeout);
248 	}
249 
250 protected:
251 	TCPEndpoint*	fEndpoint;
252 	const char*		fWhich;
253 	bigtime_t		fTimeout;
254 	tcp_state		fState;
255 };
256 
257 class TimerTriggered : public AbstractTraceEntry {
258 public:
259 	TimerTriggered(TCPEndpoint* endpoint, const char* which)
260 		:
261 		fEndpoint(endpoint),
262 		fWhich(which),
263 		fState(endpoint->State())
264 	{
265 		Initialized();
266 	}
267 
268 	virtual void AddDump(TraceOutput& out)
269 	{
270 		out.Print("tcp:%p (%12s) %s timer triggered", fEndpoint,
271 			name_for_state(fState), fWhich);
272 	}
273 
274 protected:
275 	TCPEndpoint*	fEndpoint;
276 	const char*		fWhich;
277 	tcp_state		fState;
278 };
279 
280 class APICall : public AbstractTraceEntry {
281 public:
282 	APICall(TCPEndpoint* endpoint, const char* which)
283 		:
284 		fEndpoint(endpoint),
285 		fWhich(which),
286 		fState(endpoint->State())
287 	{
288 		Initialized();
289 	}
290 
291 	virtual void AddDump(TraceOutput& out)
292 	{
293 		out.Print("tcp:%p (%12s) api call: %s", fEndpoint,
294 			name_for_state(fState), fWhich);
295 	}
296 
297 protected:
298 	TCPEndpoint*	fEndpoint;
299 	const char*		fWhich;
300 	tcp_state		fState;
301 };
302 
303 }	// namespace TCPTracing
304 
305 #	define T(x)	new(std::nothrow) TCPTracing::x
306 #else
307 #	define T(x)
308 #endif	// TCP_TRACING
309 
310 
// bit values for the fFlags field, one bit per flag
enum {
	FLAG_OPTION_WINDOW_SCALE	= 1 << 0,
	FLAG_OPTION_TIMESTAMP		= 1 << 1,
	FLAG_NO_RECEIVE				= 1 << 2,
		// TODO: Should FLAG_NO_RECEIVE apply as well to received
		//       connections? That is, what is expected from accept() after
		//       a shutdown() is performed on a listen()ing socket.
	FLAG_CLOSED					= 1 << 3,
	FLAG_DELETE_ON_CLOSE		= 1 << 4,
	FLAG_LOCAL					= 1 << 5,
	FLAG_RECOVERY				= 1 << 6,
	FLAG_OPTION_SACK_PERMITTED	= 1 << 7,
};
325 
326 
// Divisor that converts the microsecond system time into the millisecond
// based TCP time.
static const int kTimestampFactor = 1000;
329 
330 
331 static inline bigtime_t
332 absolute_timeout(bigtime_t timeout)
333 {
334 	if (timeout == 0 || timeout == B_INFINITE_TIMEOUT)
335 		return timeout;
336 
337 	return timeout + system_time();
338 }
339 
340 
341 static inline status_t
342 posix_error(status_t error)
343 {
344 	if (error == B_TIMED_OUT)
345 		return B_WOULD_BLOCK;
346 
347 	return error;
348 }
349 
350 
351 static inline bool
352 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext,
353 	uint32 receiveWindow)
354 {
355 	return sequence >= receiveNext && sequence < (receiveNext + receiveWindow);
356 }
357 
358 
359 static inline bool
360 segment_in_sequence(const tcp_segment_header& segment, int size,
361 	const tcp_sequence& receiveNext, uint32 receiveWindow)
362 {
363 	tcp_sequence sequence(segment.sequence);
364 	if (size == 0) {
365 		if (receiveWindow == 0)
366 			return sequence == receiveNext;
367 		return in_window(sequence, receiveNext, receiveWindow);
368 	} else {
369 		if (receiveWindow == 0)
370 			return false;
371 		return in_window(sequence, receiveNext, receiveWindow)
372 			|| in_window(sequence + size - 1, receiveNext, receiveWindow);
373 	}
374 }
375 
376 
377 static inline bool
378 is_writable(tcp_state state)
379 {
380 	return state == ESTABLISHED || state == FINISH_RECEIVED;
381 }
382 
383 
384 static inline bool
385 is_establishing(tcp_state state)
386 {
387 	return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED;
388 }
389 
390 
391 static inline uint32 tcp_now()
392 {
393 	return system_time() / kTimestampFactor;
394 }
395 
396 
397 static inline uint32 tcp_diff_timestamp(uint32 base)
398 {
399 	uint32 now = tcp_now();
400 
401 	if (now > base)
402 		return now - base;
403 
404 	return now + UINT_MAX - base;
405 }
406 
407 
408 static inline bool
409 state_needs_finish(int32 state)
410 {
411 	return state == WAIT_FOR_FINISH_ACKNOWLEDGE
412 		|| state == FINISH_SENT || state == CLOSING;
413 }
414 
415 
416 //	#pragma mark -
417 
418 
419 TCPEndpoint::TCPEndpoint(net_socket* socket)
420 	:
421 	ProtocolSocket(socket),
422 	fManager(NULL),
423 	fOptions(0),
424 	fSendWindowShift(0),
425 	fReceiveWindowShift(0),
426 	fSendUnacknowledged(0),
427 	fSendNext(0),
428 	fSendMax(0),
429 	fSendUrgentOffset(0),
430 	fSendWindow(0),
431 	fSendMaxWindow(0),
432 	fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
433 	fSendMaxSegments(0),
434 	fSendQueue(socket->send.buffer_size),
435 	fInitialSendSequence(0),
436 	fPreviousHighestAcknowledge(0),
437 	fDuplicateAcknowledgeCount(0),
438 	fPreviousFlightSize(0),
439 	fRecover(0),
440 	fRoute(NULL),
441 	fReceiveNext(0),
442 	fReceiveMaxAdvertised(0),
443 	fReceiveWindow(socket->receive.buffer_size),
444 	fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
445 	fReceiveQueue(socket->receive.buffer_size),
446 	fSmoothedRoundTripTime(0),
447 	fRoundTripVariation(0),
448 	fSendTime(0),
449 	fRoundTripStartSequence(0),
450 	fRetransmitTimeout(TCP_INITIAL_RTT),
451 	fReceivedTimestamp(0),
452 	fCongestionWindow(0),
453 	fSlowStartThreshold(0),
454 	fState(CLOSED),
455 	fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP | FLAG_OPTION_SACK_PERMITTED)
456 {
457 	// TODO: to be replaced with a real read/write locking strategy!
458 	mutex_init(&fLock, "tcp lock");
459 
460 	fReceiveCondition.Init(this, "tcp receive");
461 	fSendCondition.Init(this, "tcp send");
462 
463 	gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this);
464 	gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer,
465 		this);
466 	gStackModule->init_timer(&fDelayedAcknowledgeTimer,
467 		TCPEndpoint::_DelayedAcknowledgeTimer, this);
468 	gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer,
469 		this);
470 
471 	T(APICall(this, "constructor"));
472 }
473 
474 
475 TCPEndpoint::~TCPEndpoint()
476 {
477 	mutex_lock(&fLock);
478 
479 	T(APICall(this, "destructor"));
480 
481 	_CancelConnectionTimers();
482 	gStackModule->cancel_timer(&fTimeWaitTimer);
483 	T(TimerSet(this, "time-wait", -1));
484 
485 	if (fManager != NULL) {
486 		fManager->Unbind(this);
487 		put_endpoint_manager(fManager);
488 	}
489 
490 	mutex_destroy(&fLock);
491 
492 	// we need to wait for all timers to return
493 	gStackModule->wait_for_timer(&fRetransmitTimer);
494 	gStackModule->wait_for_timer(&fPersistTimer);
495 	gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer);
496 	gStackModule->wait_for_timer(&fTimeWaitTimer);
497 
498 	gDatalinkModule->put_route(Domain(), fRoute);
499 }
500 
501 
502 status_t
503 TCPEndpoint::InitCheck() const
504 {
505 	return B_OK;
506 }
507 
508 
509 //	#pragma mark - protocol API
510 
511 
512 status_t
513 TCPEndpoint::Open()
514 {
515 	TRACE("Open()");
516 	T(APICall(this, "open"));
517 
518 	status_t status = ProtocolSocket::Open();
519 	if (status < B_OK)
520 		return status;
521 
522 	fManager = get_endpoint_manager(Domain());
523 	if (fManager == NULL)
524 		return EAFNOSUPPORT;
525 
526 	return B_OK;
527 }
528 
529 
530 status_t
531 TCPEndpoint::Close()
532 {
533 	MutexLocker locker(fLock);
534 
535 	TRACE("Close()");
536 	T(APICall(this, "close"));
537 
538 	if (fState == LISTEN)
539 		delete_sem(fAcceptSemaphore);
540 
541 	if (fState == SYNCHRONIZE_SENT || fState == LISTEN) {
542 		// TODO: what about linger in case of SYNCHRONIZE_SENT?
543 		fState = CLOSED;
544 		T(State(this));
545 		return B_OK;
546 	}
547 
548 	// handle linger with zero timeout
549 	if ((socket->options & SO_LINGER) != 0 && socket->linger == 0) {
550 		fState = CLOSED;
551 		T(State(this));
552 		return _SendQueued(true);
553 	}
554 
555 	status_t status = _Disconnect(true);
556 	if (status != B_OK)
557 		return status;
558 
559 	if ((socket->options & SO_LINGER) != 0) {
560 		TRACE("Close(): Lingering for %i secs", socket->linger);
561 
562 		bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL);
563 
564 		while (fSendQueue.Used() > 0) {
565 			status = _WaitForCondition(fSendCondition, locker, maximum);
566 			if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
567 				break;
568 			else if (status < B_OK)
569 				return status;
570 		}
571 
572 		TRACE("Close(): after waiting, the SendQ was left with %" B_PRIuSIZE
573 			" bytes.", fSendQueue.Used());
574 	}
575 	return B_OK;
576 }
577 
578 
579 void
580 TCPEndpoint::Free()
581 {
582 	MutexLocker _(fLock);
583 
584 	TRACE("Free()");
585 	T(APICall(this, "free"));
586 
587 	if (fState <= SYNCHRONIZE_SENT)
588 		return;
589 
590 	// we are only interested in the timer, not in changing state
591 	_EnterTimeWait();
592 
593 	fFlags |= FLAG_CLOSED;
594 	if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) {
595 		// we'll be freed later when the 2MSL timer expires
596 		gSocketModule->acquire_socket(socket);
597 	}
598 }
599 
600 
601 /*!	Creates and sends a synchronize packet to /a address, and then waits
602 	until the connection has been established or refused.
603 */
604 status_t
605 TCPEndpoint::Connect(const sockaddr* address)
606 {
607 	if (!AddressModule()->is_same_family(address))
608 		return EAFNOSUPPORT;
609 
610 	MutexLocker locker(fLock);
611 
612 	TRACE("Connect() on address %s", PrintAddress(address));
613 	T(APICall(this, "connect"));
614 
615 	if (gStackModule->is_restarted_syscall()) {
616 		bigtime_t timeout = gStackModule->restore_syscall_restart_timeout();
617 		status_t status = _WaitForEstablished(locker, timeout);
618 		TRACE("  Connect(): Connection complete: %s (timeout was %"
619 			B_PRIdBIGTIME ")", strerror(status), timeout);
620 		return posix_error(status);
621 	}
622 
623 	// Can only call connect() from CLOSED or LISTEN states
624 	// otherwise endpoint is considered already connected
625 	if (fState == LISTEN) {
626 		// this socket is about to connect; remove pending connections in the backlog
627 		gSocketModule->set_max_backlog(socket, 0);
628 	} else if (fState == ESTABLISHED) {
629 		return EISCONN;
630 	} else if (fState != CLOSED)
631 		return EALREADY;
632 
633 	// consider destination address INADDR_ANY as INADDR_LOOPBACK
634 	sockaddr_storage _address;
635 	if (AddressModule()->is_empty_address(address, false)) {
636 		AddressModule()->get_loopback_address((sockaddr *)&_address);
637 		// for IPv4 and IPv6 the port is at the same offset
638 		((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port;
639 		address = (sockaddr *)&_address;
640 	}
641 
642 	status_t status = _PrepareSendPath(address);
643 	if (status < B_OK)
644 		return status;
645 
646 	TRACE("  Connect(): starting 3-way handshake...");
647 
648 	fState = SYNCHRONIZE_SENT;
649 	T(State(this));
650 
651 	// send SYN
652 	status = _SendAcknowledge();
653 	if (status != B_OK) {
654 		_Close();
655 		return status;
656 	}
657 
658 	// If we are running over Loopback, after _SendQueued() returns we
659 	// may be in ESTABLISHED already.
660 	if (fState == ESTABLISHED) {
661 		TRACE("  Connect() completed after _SendQueued()");
662 		return B_OK;
663 	}
664 
665 	// wait until 3-way handshake is complete (if needed)
666 	bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
667 	if (timeout == 0) {
668 		// we're a non-blocking socket
669 		TRACE("  Connect() delayed, return EINPROGRESS");
670 		return EINPROGRESS;
671 	}
672 
673 	bigtime_t absoluteTimeout = absolute_timeout(timeout);
674 	gStackModule->store_syscall_restart_timeout(absoluteTimeout);
675 
676 	status = _WaitForEstablished(locker, absoluteTimeout);
677 	TRACE("  Connect(): Connection complete: %s (timeout was %" B_PRIdBIGTIME
678 		")", strerror(status), timeout);
679 	return posix_error(status);
680 }
681 
682 
683 status_t
684 TCPEndpoint::Accept(struct net_socket** _acceptedSocket)
685 {
686 	MutexLocker locker(fLock);
687 
688 	TRACE("Accept()");
689 	T(APICall(this, "accept"));
690 
691 	status_t status;
692 	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
693 	if (gStackModule->is_restarted_syscall())
694 		timeout = gStackModule->restore_syscall_restart_timeout();
695 	else
696 		gStackModule->store_syscall_restart_timeout(timeout);
697 
698 	do {
699 		locker.Unlock();
700 
701 		status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
702 			| B_CAN_INTERRUPT, timeout);
703 		if (status != B_OK) {
704 			if (status == B_TIMED_OUT && socket->receive.timeout == 0)
705 				return B_WOULD_BLOCK;
706 
707 			return status;
708 		}
709 
710 		locker.Lock();
711 		status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
712 #ifdef TRACE_TCP
713 		if (status == B_OK)
714 			TRACE("  Accept() returning %p", (*_acceptedSocket)->first_protocol);
715 #endif
716 	} while (status != B_OK);
717 
718 	return status;
719 }
720 
721 
722 status_t
723 TCPEndpoint::Bind(const sockaddr *address)
724 {
725 	if (address == NULL)
726 		return B_BAD_VALUE;
727 
728 	MutexLocker lock(fLock);
729 
730 	TRACE("Bind() on address %s", PrintAddress(address));
731 	T(APICall(this, "bind"));
732 
733 	if (fState != CLOSED)
734 		return EISCONN;
735 
736 	return fManager->Bind(this, address);
737 }
738 
739 
740 status_t
741 TCPEndpoint::Unbind(struct sockaddr *address)
742 {
743 	MutexLocker _(fLock);
744 
745 	TRACE("Unbind()");
746 	T(APICall(this, "unbind"));
747 
748 	return fManager->Unbind(this);
749 }
750 
751 
752 status_t
753 TCPEndpoint::Listen(int count)
754 {
755 	MutexLocker _(fLock);
756 
757 	TRACE("Listen()");
758 	T(APICall(this, "listen"));
759 
760 	if (fState != CLOSED && fState != LISTEN)
761 		return B_BAD_VALUE;
762 
763 	if (fState == CLOSED) {
764 		fAcceptSemaphore = create_sem(0, "tcp accept");
765 		if (fAcceptSemaphore < B_OK)
766 			return ENOBUFS;
767 
768 		status_t status = fManager->SetPassive(this);
769 		if (status != B_OK) {
770 			delete_sem(fAcceptSemaphore);
771 			fAcceptSemaphore = -1;
772 			return status;
773 		}
774 	}
775 
776 	gSocketModule->set_max_backlog(socket, count);
777 
778 	fState = LISTEN;
779 	T(State(this));
780 	return B_OK;
781 }
782 
783 
784 status_t
785 TCPEndpoint::Shutdown(int direction)
786 {
787 	MutexLocker lock(fLock);
788 
789 	TRACE("Shutdown(%i)", direction);
790 	T(APICall(this, "shutdown"));
791 
792 	if (direction == SHUT_RD || direction == SHUT_RDWR)
793 		fFlags |= FLAG_NO_RECEIVE;
794 
795 	if (direction == SHUT_WR || direction == SHUT_RDWR) {
796 		// TODO: That's not correct. After read/write shutting down the socket
797 		// one should still be able to read previously arrived data.
798 		_Disconnect(false);
799 	}
800 
801 	return B_OK;
802 }
803 
804 
805 /*!	Puts data contained in \a buffer into send buffer */
806 status_t
807 TCPEndpoint::SendData(net_buffer *buffer)
808 {
809 	MutexLocker lock(fLock);
810 
811 	TRACE("SendData(buffer %p, size %" B_PRIu32 ", flags %#" B_PRIx32
812 		") [total %" B_PRIuSIZE " bytes, has %" B_PRIuSIZE "]", buffer,
813 		buffer->size, buffer->flags, fSendQueue.Size(), fSendQueue.Free());
814 	T(APICall(this, "senddata"));
815 
816 	const uint32 flags = buffer->flags;
817 	if ((flags & ~(MSG_DONTWAIT | MSG_OOB | MSG_EOF)) != 0)
818 		return EOPNOTSUPP;
819 
820 	if (fState == CLOSED)
821 		return ENOTCONN;
822 	if (fState == LISTEN)
823 		return EDESTADDRREQ;
824 	if (!is_writable(fState) && !is_establishing(fState))
825 		return EPIPE;
826 
827 	size_t left = buffer->size;
828 
829 	bigtime_t timeout = 0;
830 	if ((flags & MSG_DONTWAIT) == 0) {
831 		timeout = absolute_timeout(socket->send.timeout);
832 		if (gStackModule->is_restarted_syscall())
833 			timeout = gStackModule->restore_syscall_restart_timeout();
834 		else
835 			gStackModule->store_syscall_restart_timeout(timeout);
836 	}
837 
838 	while (left > 0) {
839 		while (fSendQueue.Free() < socket->send.low_water_mark) {
840 			// initiate a send before waiting
841 			_SendQueued();
842 
843 			// wait until enough space is available
844 			status_t status = _WaitForCondition(fSendCondition, lock, timeout);
845 			if (status < B_OK) {
846 				TRACE("  SendData() returning %s (%d)",
847 					strerror(posix_error(status)), (int)posix_error(status));
848 				return posix_error(status);
849 			}
850 
851 			if (!is_writable(fState) && !is_establishing(fState))
852 				return EPIPE;
853 		}
854 
855 		size_t size = fSendQueue.Free();
856 		if (size < left) {
857 			// we need to split the original buffer
858 			net_buffer* clone = gBufferModule->clone(buffer, false);
859 				// TODO: add offset/size parameter to net_buffer::clone() or
860 				// even a move_data() function, as this is a bit inefficient
861 			if (clone == NULL)
862 				return ENOBUFS;
863 
864 			status_t status = gBufferModule->trim(clone, size);
865 			if (status != B_OK) {
866 				gBufferModule->free(clone);
867 				return status;
868 			}
869 
870 			gBufferModule->remove_header(buffer, size);
871 			left -= size;
872 			fSendQueue.Add(clone);
873 		} else {
874 			left -= buffer->size;
875 			fSendQueue.Add(buffer);
876 		}
877 	}
878 
879 	TRACE("  SendData(): %" B_PRIuSIZE " bytes used.", fSendQueue.Used());
880 
881 	bool force = false;
882 	if ((flags & MSG_OOB) != 0) {
883 		fSendUrgentOffset = fSendQueue.LastSequence();
884 			// RFC 961 specifies that the urgent offset points to the last
885 			// byte of urgent data. However, this is commonly implemented as
886 			// here, ie. it points to the first byte after the urgent data.
887 		force = true;
888 	}
889 	if ((flags & MSG_EOF) != 0)
890 		_Disconnect(false);
891 
892 	_SendQueued(force);
893 
894 	return B_OK;
895 }
896 
897 
898 ssize_t
899 TCPEndpoint::SendAvailable()
900 {
901 	MutexLocker locker(fLock);
902 
903 	ssize_t available;
904 
905 	if (is_writable(fState))
906 		available = fSendQueue.Free();
907 	else if (is_establishing(fState))
908 		available = 0;
909 	else
910 		available = EPIPE;
911 
912 	TRACE("SendAvailable(): %" B_PRIdSSIZE, available);
913 	T(APICall(this, "sendavailable"));
914 	return available;
915 }
916 
917 
918 status_t
919 TCPEndpoint::FillStat(net_stat *stat)
920 {
921 	MutexLocker _(fLock);
922 
923 	strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
924 	stat->receive_queue_size = fReceiveQueue.Available();
925 	stat->send_queue_size = fSendQueue.Used();
926 
927 	return B_OK;
928 }
929 
930 
931 status_t
932 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
933 {
934 	if ((flags & ~(MSG_DONTWAIT | MSG_WAITALL | MSG_PEEK)) != 0)
935 		return EOPNOTSUPP;
936 
937 	MutexLocker locker(fLock);
938 
939 	TRACE("ReadData(%" B_PRIuSIZE " bytes, flags %#" B_PRIx32 ")", numBytes,
940 		flags);
941 	T(APICall(this, "readdata"));
942 
943 	*_buffer = NULL;
944 
945 	if (fState == CLOSED || fState == LISTEN) {
946 		if (socket->error != B_OK)
947 			return socket->error;
948 		return ENOTCONN;
949 	}
950 
951 	bigtime_t timeout = 0;
952 	if ((flags & MSG_DONTWAIT) == 0) {
953 		timeout = absolute_timeout(socket->receive.timeout);
954 		if (gStackModule->is_restarted_syscall())
955 			timeout = gStackModule->restore_syscall_restart_timeout();
956 		else
957 			gStackModule->store_syscall_restart_timeout(timeout);
958 	}
959 
960 	if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
961 		if (flags & MSG_DONTWAIT)
962 			return B_WOULD_BLOCK;
963 
964 		status_t status = _WaitForEstablished(locker, timeout);
965 		if (status < B_OK)
966 			return posix_error(status);
967 	}
968 
969 	size_t dataNeeded = socket->receive.low_water_mark;
970 
971 	// When MSG_WAITALL is set then the function should block
972 	// until the full amount of data can be returned.
973 	if (flags & MSG_WAITALL)
974 		dataNeeded = numBytes;
975 
976 	// TODO: add support for urgent data (MSG_OOB)
977 
978 	while (true) {
979 		if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
980 			|| fState == TIME_WAIT) {
981 			// ``Connection closing''.
982 			if (fReceiveQueue.Available() > 0)
983 				break;
984 			return B_OK;
985 		}
986 
987 		if (fReceiveQueue.Available() > 0) {
988 			if (fReceiveQueue.Available() >= dataNeeded
989 				|| (fReceiveQueue.PushedData() > 0
990 					&& fReceiveQueue.PushedData() >= fReceiveQueue.Available()))
991 				break;
992 		} else if (fState == FINISH_RECEIVED) {
993 			// ``If no text is awaiting delivery, the RECEIVE will
994 			//   get a Connection closing''.
995 			return B_OK;
996 		}
997 
998 		if (timeout == 0)
999 			return B_WOULD_BLOCK;
1000 
1001 		if ((fFlags & FLAG_NO_RECEIVE) != 0)
1002 			return B_OK;
1003 
1004 		status_t status = _WaitForCondition(fReceiveCondition, locker, timeout);
1005 		if (status < B_OK) {
1006 			// The Open Group base specification mentions that EINTR should be
1007 			// returned if the recv() is interrupted before _any data_ is
1008 			// available. So we actually check if there is data, and if so,
1009 			// push it to the user.
1010 			if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
1011 				&& fReceiveQueue.Available() > 0)
1012 				break;
1013 
1014 			return posix_error(status);
1015 		}
1016 	}
1017 
1018 	TRACE("  ReadData(): %" B_PRIuSIZE " are available.",
1019 		fReceiveQueue.Available());
1020 
1021 	if (numBytes < fReceiveQueue.Available())
1022 		fReceiveCondition.NotifyAll();
1023 
1024 	bool clone = (flags & MSG_PEEK) != 0;
1025 
1026 	ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);
1027 
1028 	TRACE("  ReadData(): %" B_PRIuSIZE " bytes kept.",
1029 		fReceiveQueue.Available());
1030 
1031 	if (fReceiveQueue.Available() == 0 && fState == FINISH_RECEIVED)
1032 		socket->receive.low_water_mark = 0;
1033 
1034 	// if we are opening the window, check if we should send an ACK
1035 	if (!clone)
1036 		_SendAcknowledge();
1037 
1038 	return receivedBytes;
1039 }
1040 
1041 
1042 ssize_t
1043 TCPEndpoint::ReadAvailable()
1044 {
1045 	MutexLocker locker(fLock);
1046 
1047 	TRACE("ReadAvailable(): %" B_PRIdSSIZE, _AvailableData());
1048 	T(APICall(this, "readavailable"));
1049 
1050 	return _AvailableData();
1051 }
1052 
1053 
1054 status_t
1055 TCPEndpoint::SetSendBufferSize(size_t length)
1056 {
1057 	MutexLocker _(fLock);
1058 	fSendQueue.SetMaxBytes(length);
1059 	return B_OK;
1060 }
1061 
1062 
1063 status_t
1064 TCPEndpoint::SetReceiveBufferSize(size_t length)
1065 {
1066 	MutexLocker _(fLock);
1067 	fReceiveQueue.SetMaxBytes(length);
1068 	return B_OK;
1069 }
1070 
1071 
1072 status_t
1073 TCPEndpoint::GetOption(int option, void* _value, int* _length)
1074 {
1075 	if (*_length != sizeof(int))
1076 		return B_BAD_VALUE;
1077 
1078 	int* value = (int*)_value;
1079 
1080 	switch (option) {
1081 		case TCP_NODELAY:
1082 			if ((fOptions & TCP_NODELAY) != 0)
1083 				*value = 1;
1084 			else
1085 				*value = 0;
1086 			return B_OK;
1087 
1088 		case TCP_MAXSEG:
1089 			*value = fReceiveMaxSegmentSize;
1090 			return B_OK;
1091 
1092 		default:
1093 			return B_BAD_VALUE;
1094 	}
1095 }
1096 
1097 
1098 status_t
1099 TCPEndpoint::SetOption(int option, const void* _value, int length)
1100 {
1101 	if (option != TCP_NODELAY)
1102 		return B_BAD_VALUE;
1103 
1104 	if (length != sizeof(int))
1105 		return B_BAD_VALUE;
1106 
1107 	const int* value = (const int*)_value;
1108 
1109 	MutexLocker _(fLock);
1110 	if (*value)
1111 		fOptions |= TCP_NODELAY;
1112 	else
1113 		fOptions &= ~TCP_NODELAY;
1114 
1115 	return B_OK;
1116 }
1117 
1118 
1119 //	#pragma mark - misc
1120 
1121 
1122 bool
1123 TCPEndpoint::IsBound() const
1124 {
1125 	return !LocalAddress().IsEmpty(true);
1126 }
1127 
1128 
1129 bool
1130 TCPEndpoint::IsLocal() const
1131 {
1132 	return (fFlags & FLAG_LOCAL) != 0;
1133 }
1134 
1135 
1136 status_t
1137 TCPEndpoint::DelayedAcknowledge()
1138 {
1139 	// ACKs "MUST" be generated within 500ms of the first unACKed packet, and
1140 	// "SHOULD" be for at least every second full-size segment. (RFC 5681 § 4.2)
1141 
1142 	bigtime_t delay = TCP_DELAYED_ACKNOWLEDGE_TIMEOUT;
1143 	if ((fReceiveNext - fLastAcknowledgeSent) >= (fReceiveMaxSegmentSize * 2)) {
1144 		// Trigger an immediate timeout rather than invoking Send directly,
1145 		// allowing multiple invocations to be coalesced.
1146 		delay = 0;
1147 	} else if (gStackModule->is_timer_active(&fDelayedAcknowledgeTimer)) {
1148 		return B_OK;
1149 	}
1150 
1151 	gStackModule->set_timer(&fDelayedAcknowledgeTimer, delay);
1152 	T(TimerSet(this, "delayed ack", TCP_DELAYED_ACKNOWLEDGE_TIMEOUT));
1153 	return B_OK;
1154 }
1155 
1156 
1157 void
1158 TCPEndpoint::_StartPersistTimer()
1159 {
1160 	gStackModule->set_timer(&fPersistTimer, TCP_PERSIST_TIMEOUT);
1161 	T(TimerSet(this, "persist", TCP_PERSIST_TIMEOUT));
1162 }
1163 
1164 
1165 void
1166 TCPEndpoint::_EnterTimeWait()
1167 {
1168 	TRACE("_EnterTimeWait()");
1169 
1170 	if (fState == TIME_WAIT) {
1171 		_CancelConnectionTimers();
1172 	}
1173 
1174 	_UpdateTimeWait();
1175 }
1176 
1177 
1178 void
1179 TCPEndpoint::_UpdateTimeWait()
1180 {
1181 	gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
1182 	T(TimerSet(this, "time-wait", TCP_MAX_SEGMENT_LIFETIME << 1));
1183 }
1184 
1185 
1186 void
1187 TCPEndpoint::_CancelConnectionTimers()
1188 {
1189 	gStackModule->cancel_timer(&fRetransmitTimer);
1190 	T(TimerSet(this, "retransmit", -1));
1191 	gStackModule->cancel_timer(&fPersistTimer);
1192 	T(TimerSet(this, "persist", -1));
1193 	gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
1194 	T(TimerSet(this, "delayed ack", -1));
1195 }
1196 
1197 
1198 /*!	Sends the FIN flag to the peer when the connection is still open.
1199 	Moves the endpoint to the next state depending on where it was.
1200 */
1201 status_t
1202 TCPEndpoint::_Disconnect(bool closing)
1203 {
1204 	tcp_state previousState = fState;
1205 
1206 	if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
1207 		fState = FINISH_SENT;
1208 	else if (fState == FINISH_RECEIVED)
1209 		fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
1210 	else
1211 		return B_OK;
1212 
1213 	T(State(this));
1214 
1215 	status_t status = _SendQueued();
1216 	if (status != B_OK) {
1217 		fState = previousState;
1218 		T(State(this));
1219 		return status;
1220 	}
1221 
1222 	return B_OK;
1223 }
1224 
1225 
1226 void
1227 TCPEndpoint::_MarkEstablished()
1228 {
1229 	fState = ESTABLISHED;
1230 	T(State(this));
1231 
1232 	gSocketModule->set_connected(socket);
1233 	if (gSocketModule->has_parent(socket))
1234 		release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
1235 
1236 	fSendCondition.NotifyAll();
1237 	gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free());
1238 }
1239 
1240 
1241 status_t
1242 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
1243 {
1244 	// TODO: Checking for CLOSED seems correct, but breaks several neon tests.
1245 	// When investigating this, also have a look at _Close() and _HandleReset().
1246 	while (fState < ESTABLISHED/* && fState != CLOSED*/) {
1247 		if (socket->error != B_OK)
1248 			return socket->error;
1249 
1250 		status_t status = _WaitForCondition(fSendCondition, locker, timeout);
1251 		if (status < B_OK)
1252 			return status;
1253 	}
1254 
1255 	return B_OK;
1256 }
1257 
1258 
1259 //	#pragma mark - receive
1260 
1261 
1262 void
1263 TCPEndpoint::_Close()
1264 {
1265 	_CancelConnectionTimers();
1266 	fState = CLOSED;
1267 	T(State(this));
1268 
1269 	fFlags |= FLAG_DELETE_ON_CLOSE;
1270 
1271 	fSendCondition.NotifyAll();
1272 	_NotifyReader();
1273 
1274 	if (gSocketModule->has_parent(socket)) {
1275 		// We still have a parent - obviously, we haven't been accepted yet,
1276 		// so no one could ever close us.
1277 		_CancelConnectionTimers();
1278 		gSocketModule->set_aborted(socket);
1279 	}
1280 }
1281 
1282 
1283 void
1284 TCPEndpoint::_HandleReset(status_t error)
1285 {
1286 	socket->error = error;
1287 	_Close();
1288 
1289 	gSocketModule->notify(socket, B_SELECT_WRITE, error);
1290 	gSocketModule->notify(socket, B_SELECT_ERROR, error);
1291 }
1292 
1293 
1294 void
1295 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
1296 {
1297 	if (fDuplicateAcknowledgeCount == 0)
1298 		fPreviousFlightSize = (fSendMax - fSendUnacknowledged).Number();
1299 
1300 	if (++fDuplicateAcknowledgeCount < 3) {
1301 		if (fSendQueue.Available(fSendMax) != 0  && fSendWindow != 0) {
1302 			fSendNext = fSendMax;
1303 			fCongestionWindow += fDuplicateAcknowledgeCount * fSendMaxSegmentSize;
1304 			_SendQueued();
1305 			TRACE("_DuplicateAcknowledge(): packet sent under limited transmit on receipt of dup ack");
1306 			fCongestionWindow -= fDuplicateAcknowledgeCount * fSendMaxSegmentSize;
1307 		}
1308 	}
1309 
1310 	if (fDuplicateAcknowledgeCount == 3) {
1311 		if ((segment.acknowledge - 1) > fRecover || (fCongestionWindow > fSendMaxSegmentSize &&
1312 			(fSendUnacknowledged - fPreviousHighestAcknowledge) <= 4 * fSendMaxSegmentSize)) {
1313 			fFlags |= FLAG_RECOVERY;
1314 			fRecover = fSendMax.Number() - 1;
1315 			fSlowStartThreshold = max_c(fPreviousFlightSize / 2, 2 * fSendMaxSegmentSize);
1316 			fCongestionWindow = fSlowStartThreshold + 3 * fSendMaxSegmentSize;
1317 			fSendNext = segment.acknowledge;
1318 			_SendQueued();
1319 			TRACE("_DuplicateAcknowledge(): packet sent under fast restransmit on the receipt of 3rd dup ack");
1320 		}
1321 	} else if (fDuplicateAcknowledgeCount > 3) {
1322 		uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
1323 		if ((fDuplicateAcknowledgeCount - 3) * fSendMaxSegmentSize <= flightSize)
1324 			fCongestionWindow += fSendMaxSegmentSize;
1325 		if (fSendQueue.Available(fSendMax) != 0) {
1326 			fSendNext = fSendMax;
1327 			_SendQueued();
1328 		}
1329 	}
1330 }
1331 
1332 
1333 void
1334 TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment,
1335 	size_t segmentLength)
1336 {
1337 	if (fFlags & FLAG_OPTION_TIMESTAMP) {
1338 		tcp_sequence sequence(segment.sequence);
1339 
1340 		if (fLastAcknowledgeSent >= sequence
1341 			&& fLastAcknowledgeSent < (sequence + segmentLength))
1342 			fReceivedTimestamp = segment.timestamp_value;
1343 	}
1344 }
1345 
1346 
1347 ssize_t
1348 TCPEndpoint::_AvailableData() const
1349 {
1350 	// TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
1351 	//       the application of FLAG_NO_RECEIVE in listen()ing
1352 	//       sockets.
1353 	if (fState == LISTEN)
1354 		return gSocketModule->count_connected(socket);
1355 	if (fState == SYNCHRONIZE_SENT)
1356 		return 0;
1357 
1358 	ssize_t availableData = fReceiveQueue.Available();
1359 
1360 	if (availableData == 0 && !_ShouldReceive())
1361 		return ENOTCONN;
1362 	if (availableData == 0 && (fState == FINISH_RECEIVED || fState == WAIT_FOR_FINISH_ACKNOWLEDGE))
1363 		return ESHUTDOWN;
1364 	return availableData;
1365 }
1366 
1367 
1368 void
1369 TCPEndpoint::_NotifyReader()
1370 {
1371 	fReceiveCondition.NotifyAll();
1372 	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
1373 }
1374 
1375 
1376 bool
1377 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer)
1378 {
1379 	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1380 		// Remember the position of the finish received flag
1381 		fFinishReceived = true;
1382 		fFinishReceivedAt = segment.sequence + buffer->size;
1383 	}
1384 
1385 	fReceiveQueue.Add(buffer, segment.sequence);
1386 	fReceiveNext = fReceiveQueue.NextSequence();
1387 
1388 	if (fFinishReceived) {
1389 		// Set or reset the finish flag on the current segment
1390 		if (fReceiveNext < fFinishReceivedAt)
1391 			segment.flags &= ~TCP_FLAG_FINISH;
1392 		else
1393 			segment.flags |= TCP_FLAG_FINISH;
1394 	}
1395 
1396 	TRACE("  _AddData(): adding data, receive next = %" B_PRIu32 ". Now have %"
1397 		B_PRIuSIZE " bytes.", fReceiveNext.Number(), fReceiveQueue.Available());
1398 
1399 	if ((segment.flags & TCP_FLAG_PUSH) != 0)
1400 		fReceiveQueue.SetPushPointer();
1401 
1402 	return fReceiveQueue.Available() > 0;
1403 }
1404 
1405 
1406 void
1407 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment)
1408 {
1409 	fInitialReceiveSequence = segment.sequence;
1410 	fFinishReceived = false;
1411 
1412 	// count the received SYN
1413 	segment.sequence++;
1414 
1415 	fReceiveNext = segment.sequence;
1416 	fReceiveQueue.SetInitialSequence(segment.sequence);
1417 
1418 	if ((fOptions & TCP_NOOPT) == 0) {
1419 		if (segment.max_segment_size > 0) {
1420 			// The maximum size of a segment that a TCP endpoint really sends,
1421 			// the "effective send MSS", MUST be the smaller of the send MSS and
1422 			// the largest transmission size permitted by the IP layer:
1423 			fSendMaxSegmentSize = min_c(segment.max_segment_size,
1424 				_MaxSegmentSize(*PeerAddress()));
1425 		}
1426 
1427 		if (segment.options & TCP_HAS_WINDOW_SCALE) {
1428 			fFlags |= FLAG_OPTION_WINDOW_SCALE;
1429 			fSendWindowShift = segment.window_shift;
1430 		} else {
1431 			fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
1432 			fReceiveWindowShift = 0;
1433 		}
1434 
1435 		if (segment.options & TCP_HAS_TIMESTAMPS) {
1436 			fFlags |= FLAG_OPTION_TIMESTAMP;
1437 			fReceivedTimestamp = segment.timestamp_value;
1438 		} else
1439 			fFlags &= ~FLAG_OPTION_TIMESTAMP;
1440 
1441 		if ((segment.options & TCP_SACK_PERMITTED) == 0)
1442 			fFlags &= ~FLAG_OPTION_SACK_PERMITTED;
1443 	}
1444 
1445 	if (fSendMaxSegmentSize > 2190)
1446 		fCongestionWindow = 2 * fSendMaxSegmentSize;
1447 	else if (fSendMaxSegmentSize > 1095)
1448 		fCongestionWindow = 3 * fSendMaxSegmentSize;
1449 	else
1450 		fCongestionWindow = 4 * fSendMaxSegmentSize;
1451 
1452 	fSendMaxSegments = fCongestionWindow / fSendMaxSegmentSize;
1453 	fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
1454 }
1455 
1456 
1457 bool
1458 TCPEndpoint::_ShouldReceive() const
1459 {
1460 	if ((fFlags & FLAG_NO_RECEIVE) != 0)
1461 		return false;
1462 
1463 	return fState == ESTABLISHED || fState == FINISH_SENT
1464 		|| fState == FINISH_ACKNOWLEDGED || fState == FINISH_RECEIVED;
1465 }
1466 
1467 
1468 int32
1469 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment,
1470 	net_buffer* buffer)
1471 {
1472 	MutexLocker _(fLock);
1473 
1474 	TRACE("Spawn()");
1475 
1476 	// TODO: proper error handling!
1477 	if (ProtocolSocket::Open() != B_OK) {
1478 		T(Error(this, "opening failed", __LINE__));
1479 		return DROP;
1480 	}
1481 
1482 	fState = SYNCHRONIZE_RECEIVED;
1483 	T(Spawn(parent, this));
1484 
1485 	fManager = parent->fManager;
1486 
1487 	if (fManager->BindChild(this, buffer->destination) != B_OK) {
1488 		T(Error(this, "binding failed", __LINE__));
1489 		return DROP;
1490 	}
1491 	if (_PrepareSendPath(buffer->source) != B_OK) {
1492 		T(Error(this, "prepare send faild", __LINE__));
1493 		return DROP;
1494 	}
1495 
1496 	fOptions = parent->fOptions;
1497 	fAcceptSemaphore = parent->fAcceptSemaphore;
1498 
1499 	_PrepareReceivePath(segment);
1500 
1501 	// send SYN+ACK
1502 	if (_SendAcknowledge() != B_OK) {
1503 		T(Error(this, "sending failed", __LINE__));
1504 		return DROP;
1505 	}
1506 
1507 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1508 		// we handled this flag now, it must not be set for further processing
1509 
1510 	return _Receive(segment, buffer);
1511 }
1512 
1513 
1514 int32
1515 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer)
1516 {
1517 	TRACE("ListenReceive()");
1518 
1519 	// Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state,
1520 	// but the error behaviour differs
1521 	if (segment.flags & TCP_FLAG_RESET)
1522 		return DROP;
1523 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1524 		return DROP | RESET;
1525 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1526 		return DROP;
1527 
1528 	// TODO: drop broadcast/multicast
1529 
1530 	// spawn new endpoint for accept()
1531 	net_socket* newSocket;
1532 	if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) {
1533 		T(Error(this, "spawning failed", __LINE__));
1534 		return DROP;
1535 	}
1536 
1537 	return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this,
1538 		segment, buffer);
1539 }
1540 
1541 
1542 int32
1543 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment,
1544 	net_buffer *buffer)
1545 {
1546 	TRACE("_SynchronizeSentReceive()");
1547 
1548 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1549 		&& (fInitialSendSequence >= segment.acknowledge
1550 			|| fSendMax < segment.acknowledge))
1551 		return DROP | RESET;
1552 
1553 	if (segment.flags & TCP_FLAG_RESET) {
1554 		_HandleReset(ECONNREFUSED);
1555 		return DROP;
1556 	}
1557 
1558 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1559 		return DROP;
1560 
1561 	fSendUnacknowledged = segment.acknowledge;
1562 	_PrepareReceivePath(segment);
1563 
1564 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
1565 		_MarkEstablished();
1566 	} else {
1567 		// simultaneous open
1568 		fState = SYNCHRONIZE_RECEIVED;
1569 		T(State(this));
1570 	}
1571 
1572 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1573 		// we handled this flag now, it must not be set for further processing
1574 
1575 	return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE;
1576 }
1577 
1578 
1579 int32
1580 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer)
1581 {
1582 	// PAWS processing takes precedence over regular TCP acceptability check
1583 	if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0 && (segment.flags & TCP_FLAG_RESET) == 0) {
1584 		if ((segment.options & TCP_HAS_TIMESTAMPS) == 0)
1585 			return DROP;
1586 		if ((int32)(fReceivedTimestamp - segment.timestamp_value) > 0
1587 			&& (fReceivedTimestamp - segment.timestamp_value) <= INT32_MAX)
1588 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1589 	}
1590 
1591 	uint32 advertisedWindow = segment.AdvertisedWindow(fSendWindowShift);
1592 	size_t segmentLength = buffer->size;
1593 
1594 	// First, handle the most common case for uni-directional data transfer
1595 	// (known as header prediction - the segment must not change the window,
1596 	// and must be the expected sequence, and contain no control flags)
1597 
1598 	if (fState == ESTABLISHED
1599 		&& segment.AcknowledgeOnly()
1600 		&& fReceiveNext == segment.sequence
1601 		&& advertisedWindow > 0 && advertisedWindow == fSendWindow
1602 		&& fSendNext == fSendMax) {
1603 		_UpdateTimestamps(segment, segmentLength);
1604 
1605 		if (segmentLength == 0) {
1606 			// this is a pure acknowledge segment - we're on the sending end
1607 			if (fSendUnacknowledged < segment.acknowledge
1608 				&& fSendMax >= segment.acknowledge) {
1609 				_Acknowledged(segment);
1610 				return DROP;
1611 			}
1612 		} else if (segment.acknowledge == fSendUnacknowledged
1613 			&& fReceiveQueue.IsContiguous()
1614 			&& fReceiveQueue.Free() >= segmentLength
1615 			&& (fFlags & FLAG_NO_RECEIVE) == 0) {
1616 			if (_AddData(segment, buffer))
1617 				_NotifyReader();
1618 
1619 			return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0
1620 				? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
1621 		}
1622 	}
1623 
1624 	// The fast path was not applicable, so we continue with the standard
1625 	// processing of the incoming segment
1626 
1627 	ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN);
1628 
1629 	if (fState != CLOSED && fState != TIME_WAIT) {
1630 		// Check sequence number
1631 		if (!segment_in_sequence(segment, segmentLength, fReceiveNext,
1632 				fReceiveWindow)) {
1633 			TRACE("  Receive(): segment out of window, next: %" B_PRIu32
1634 				" wnd: %" B_PRIu32, fReceiveNext.Number(), fReceiveWindow);
1635 			if ((segment.flags & TCP_FLAG_RESET) != 0) {
1636 				// TODO: this doesn't look right - review!
1637 				return DROP;
1638 			}
1639 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1640 		}
1641 	}
1642 
1643 	if ((segment.flags & TCP_FLAG_RESET) != 0) {
1644 		// Is this a valid reset?
1645 		// We generally ignore resets in time wait state (see RFC 1337)
1646 		if (fLastAcknowledgeSent <= segment.sequence
1647 			&& tcp_sequence(segment.sequence) < (fLastAcknowledgeSent
1648 				+ fReceiveWindow)
1649 			&& fState != TIME_WAIT) {
1650 			status_t error;
1651 			if (fState == SYNCHRONIZE_RECEIVED)
1652 				error = ECONNREFUSED;
1653 			else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
1654 				error = ENOTCONN;
1655 			else
1656 				error = ECONNRESET;
1657 
1658 			_HandleReset(error);
1659 		}
1660 
1661 		return DROP;
1662 	}
1663 
1664 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1665 		|| (fState == SYNCHRONIZE_RECEIVED
1666 			&& (fInitialReceiveSequence > segment.sequence
1667 				|| ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1668 					&& (fSendUnacknowledged > segment.acknowledge
1669 						|| fSendMax < segment.acknowledge))))) {
1670 		// reset the connection - either the initial SYN was faulty, or we
1671 		// received a SYN within the data stream
1672 		return DROP | RESET;
1673 	}
1674 
1675 	// TODO: Check this! Why do we advertize a window outside of what we should
1676 	// buffer?
1677 	fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
1678 		// the window must not shrink
1679 
1680 	// trim buffer to be within the receive window
1681 	int32 drop = (int32)(fReceiveNext - segment.sequence).Number();
1682 	if (drop > 0) {
1683 		if ((uint32)drop > buffer->size
1684 			|| ((uint32)drop == buffer->size
1685 				&& (segment.flags & TCP_FLAG_FINISH) == 0)) {
1686 			// don't accidently remove a FIN we shouldn't remove
1687 			segment.flags &= ~TCP_FLAG_FINISH;
1688 			drop = buffer->size;
1689 		}
1690 
1691 		// remove duplicate data at the start
1692 		TRACE("* remove %" B_PRId32 " bytes from the start", drop);
1693 		gBufferModule->remove_header(buffer, drop);
1694 		segment.sequence += drop;
1695 	}
1696 
1697 	int32 action = KEEP;
1698 
1699 	// immediately acknowledge out-of-order segment to trigger fast-retransmit at the sender
1700 	if (drop != 0)
1701 		action |= IMMEDIATE_ACKNOWLEDGE;
1702 
1703 	drop = (int32)(segment.sequence + buffer->size
1704 		- (fReceiveNext + fReceiveWindow)).Number();
1705 	if (drop > 0) {
1706 		// remove data exceeding our window
1707 		if ((uint32)drop >= buffer->size) {
1708 			// if we can accept data, or the segment is not what we'd expect,
1709 			// drop the segment (an immediate acknowledge is always triggered)
1710 			if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
1711 				return DROP | IMMEDIATE_ACKNOWLEDGE;
1712 
1713 			action |= IMMEDIATE_ACKNOWLEDGE;
1714 		}
1715 
1716 		if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1717 			// we need to remove the finish, too, as part of the data
1718 			drop--;
1719 		}
1720 
1721 		segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
1722 		TRACE("* remove %" B_PRId32 " bytes from the end", drop);
1723 		gBufferModule->remove_trailer(buffer, drop);
1724 	}
1725 
1726 #ifdef TRACE_TCP
1727 	if (advertisedWindow > fSendWindow) {
1728 		TRACE("  Receive(): Window update %" B_PRIu32 " -> %" B_PRIu32,
1729 			fSendWindow, advertisedWindow);
1730 	}
1731 #endif
1732 
1733 	if (fSendWindow < fSendMaxSegmentSize
1734 			&& advertisedWindow >= fSendMaxSegmentSize) {
1735 		// Our current send window is less than a segment wide, and the new one
1736 		// is larger, so trigger a send in case there's anything to be sent.
1737 		action |= SEND_QUEUED;
1738 	}
1739 
1740 	fSendWindow = advertisedWindow;
1741 	if (advertisedWindow > fSendMaxWindow)
1742 		fSendMaxWindow = advertisedWindow;
1743 
1744 	// Then look at the acknowledgement for any updates
1745 
1746 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
1747 		// process acknowledged data
1748 		if (fState == SYNCHRONIZE_RECEIVED)
1749 			_MarkEstablished();
1750 
1751 		if (fSendMax < segment.acknowledge)
1752 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1753 
1754 		if (segment.acknowledge == fSendUnacknowledged) {
1755 			if (buffer->size == 0 && advertisedWindow == fSendWindow
1756 				&& (segment.flags & TCP_FLAG_FINISH) == 0 && fSendUnacknowledged != fSendMax) {
1757 				TRACE("Receive(): duplicate ack!");
1758 				_DuplicateAcknowledge(segment);
1759 			}
1760 		} else if (segment.acknowledge < fSendUnacknowledged) {
1761 			return DROP;
1762 		} else {
1763 			// this segment acknowledges in flight data
1764 
1765 			if (fDuplicateAcknowledgeCount >= 3) {
1766 				// deflate the window.
1767 				if (segment.acknowledge > fRecover) {
1768 					uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
1769 					fCongestionWindow = min_c(fSlowStartThreshold,
1770 						max_c(flightSize, fSendMaxSegmentSize) + fSendMaxSegmentSize);
1771 					fFlags &= ~FLAG_RECOVERY;
1772 				}
1773 			}
1774 
1775 			if (fSendMax == segment.acknowledge)
1776 				TRACE("Receive(): all inflight data ack'd!");
1777 
1778 			if (segment.acknowledge > fSendQueue.LastSequence()
1779 					&& fState > ESTABLISHED) {
1780 				TRACE("Receive(): FIN has been acknowledged!");
1781 
1782 				switch (fState) {
1783 					case FINISH_SENT:
1784 						fState = FINISH_ACKNOWLEDGED;
1785 						T(State(this));
1786 						break;
1787 					case CLOSING:
1788 						fState = TIME_WAIT;
1789 						T(State(this));
1790 						_EnterTimeWait();
1791 						return DROP;
1792 					case WAIT_FOR_FINISH_ACKNOWLEDGE:
1793 						_Close();
1794 						break;
1795 
1796 					default:
1797 						break;
1798 				}
1799 			}
1800 
1801 			if (fState != CLOSED) {
1802 				tcp_sequence last = fLastAcknowledgeSent;
1803 				_Acknowledged(segment);
1804 				// we just sent an acknowledge, remove from action
1805 				if (last < fLastAcknowledgeSent)
1806 					action &= ~IMMEDIATE_ACKNOWLEDGE;
1807 			}
1808 		}
1809 	}
1810 
1811 	if (segment.flags & TCP_FLAG_URGENT) {
1812 		if (fState == ESTABLISHED || fState == FINISH_SENT
1813 			|| fState == FINISH_ACKNOWLEDGED) {
1814 			// TODO: Handle urgent data:
1815 			//  - RCV.UP <- max(RCV.UP, SEG.UP)
1816 			//  - signal the user that urgent data is available (SIGURG)
1817 		}
1818 	}
1819 
1820 	bool notify = false;
1821 
1822 	// The buffer may be freed if its data is added to the queue, so cache
1823 	// the size as we still need it later.
1824 	const uint32 bufferSize = buffer->size;
1825 
1826 	if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0)
1827 		&& _ShouldReceive())
1828 		notify = _AddData(segment, buffer);
1829 	else {
1830 		if ((fFlags & FLAG_NO_RECEIVE) != 0)
1831 			fReceiveNext += buffer->size;
1832 
1833 		action = (action & ~KEEP) | DROP;
1834 	}
1835 
1836 	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1837 		segmentLength++;
1838 		if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
1839 			TRACE("Receive(): peer is finishing connection!");
1840 			fReceiveNext++;
1841 			notify = true;
1842 
1843 			// FIN implies PUSH
1844 			fReceiveQueue.SetPushPointer();
1845 
1846 			// we'll reply immediately to the FIN if we are not
1847 			// transitioning to TIME WAIT so we immediatly ACK it.
1848 			action |= IMMEDIATE_ACKNOWLEDGE;
1849 
1850 			// other side is closing connection; change states
1851 			switch (fState) {
1852 				case ESTABLISHED:
1853 				case SYNCHRONIZE_RECEIVED:
1854 					fState = FINISH_RECEIVED;
1855 					T(State(this));
1856 					break;
1857 				case FINISH_SENT:
1858 					// simultaneous close
1859 					fState = CLOSING;
1860 					T(State(this));
1861 					break;
1862 				case FINISH_ACKNOWLEDGED:
1863 					fState = TIME_WAIT;
1864 					T(State(this));
1865 					_EnterTimeWait();
1866 					break;
1867 				case TIME_WAIT:
1868 					_UpdateTimeWait();
1869 					break;
1870 
1871 				default:
1872 					break;
1873 			}
1874 		}
1875 	}
1876 
1877 	if (notify)
1878 		_NotifyReader();
1879 
1880 	if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
1881 		action |= ACKNOWLEDGE;
1882 
1883 	_UpdateTimestamps(segment, segmentLength);
1884 
1885 	TRACE("Receive() Action %" B_PRId32, action);
1886 
1887 	return action;
1888 }
1889 
1890 
1891 int32
1892 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer)
1893 {
1894 	MutexLocker locker(fLock);
1895 
1896 	TRACE("SegmentReceived(): buffer %p (%" B_PRIu32 " bytes) address %s "
1897 		"to %s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
1898 		", wnd %" B_PRIu32, buffer, buffer->size, PrintAddress(buffer->source),
1899 		PrintAddress(buffer->destination), segment.flags, segment.sequence,
1900 		segment.acknowledge,
1901 		(uint32)segment.advertised_window << fSendWindowShift);
1902 	T(Receive(this, segment,
1903 		(uint32)segment.advertised_window << fSendWindowShift, buffer));
1904 	int32 segmentAction = DROP;
1905 
1906 	switch (fState) {
1907 		case LISTEN:
1908 			segmentAction = _ListenReceive(segment, buffer);
1909 			break;
1910 
1911 		case SYNCHRONIZE_SENT:
1912 			segmentAction = _SynchronizeSentReceive(segment, buffer);
1913 			break;
1914 
1915 		case SYNCHRONIZE_RECEIVED:
1916 		case ESTABLISHED:
1917 		case FINISH_RECEIVED:
1918 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1919 		case FINISH_SENT:
1920 		case FINISH_ACKNOWLEDGED:
1921 		case CLOSING:
1922 		case TIME_WAIT:
1923 		case CLOSED:
1924 			segmentAction = _Receive(segment, buffer);
1925 			break;
1926 	}
1927 
1928 	// process acknowledge action as asked for by the *Receive() method
1929 	if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
1930 		_SendAcknowledge(true);
1931 	else if (segmentAction & ACKNOWLEDGE)
1932 		DelayedAcknowledge();
1933 
1934 	if (segmentAction & SEND_QUEUED)
1935 		_SendQueued();
1936 
1937 	if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE))
1938 			== (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) {
1939 
1940 		locker.Unlock();
1941 		if (gSocketModule->release_socket(socket))
1942 			segmentAction |= DELETED_ENDPOINT;
1943 	}
1944 
1945 	return segmentAction;
1946 }
1947 
1948 
1949 //	#pragma mark - send
1950 
1951 
1952 tcp_segment_header
1953 TCPEndpoint::_PrepareSendSegment()
1954 {
1955 	// we don't set FLAG_FINISH here, instead we do it
1956 	// conditionally below depending if we are sending
1957 	// the last bytes of the send queue.
1958 
1959 	uint8 flags = 0;
1960 	switch (fState) {
1961 		case CLOSED:
1962 			flags = TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE;
1963 			break;
1964 
1965 		case SYNCHRONIZE_SENT:
1966 			flags = TCP_FLAG_SYNCHRONIZE;
1967 			break;
1968 
1969 		case SYNCHRONIZE_RECEIVED:
1970 			flags = TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE;
1971 			break;
1972 
1973 		case ESTABLISHED:
1974 		case FINISH_RECEIVED:
1975 		case FINISH_ACKNOWLEDGED:
1976 		case TIME_WAIT:
1977 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1978 		case FINISH_SENT:
1979 		case CLOSING:
1980 			flags = TCP_FLAG_ACKNOWLEDGE;
1981 
1982 		default:
1983 			break;
1984 	}
1985 
1986 	tcp_segment_header segment(flags);
1987 
1988 	if ((fOptions & TCP_NOOPT) == 0) {
1989 		if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) {
1990 			segment.options |= TCP_HAS_TIMESTAMPS;
1991 			segment.timestamp_reply = fReceivedTimestamp;
1992 			segment.timestamp_value = tcp_now();
1993 		}
1994 
1995 		if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1996 				&& fSendNext == fInitialSendSequence) {
1997 			// add connection establishment options
1998 			segment.max_segment_size = fReceiveMaxSegmentSize;
1999 			if (fFlags & FLAG_OPTION_WINDOW_SCALE) {
2000 				segment.options |= TCP_HAS_WINDOW_SCALE;
2001 				segment.window_shift = fReceiveWindowShift;
2002 			}
2003 			if ((fFlags & FLAG_OPTION_SACK_PERMITTED) != 0)
2004 				segment.options |= TCP_SACK_PERMITTED;
2005 		}
2006 
2007 		if (!fReceiveQueue.IsContiguous()
2008 				&& (fFlags & FLAG_OPTION_SACK_PERMITTED) != 0) {
2009 			segment.options |= TCP_HAS_SACK;
2010 			int maxSackCount = MAX_SACK_BLKS
2011 				- ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) ? 1 : 0;
2012 			memset(segment.sacks, 0, sizeof(segment.sacks));
2013 			segment.sackCount = fReceiveQueue.PopulateSackInfo(fReceiveNext,
2014 				maxSackCount, segment.sacks);
2015 		}
2016 	}
2017 
2018 	size_t availableBytes = fReceiveQueue.Free();
2019 	// window size must remain same for duplicate acknowledgements
2020 	if (!fReceiveQueue.IsContiguous())
2021 		availableBytes = (fReceiveMaxAdvertised - fReceiveNext).Number();
2022 	segment.SetAdvertisedWindow(availableBytes, fReceiveWindowShift);
2023 
2024 	segment.acknowledge = fReceiveNext.Number();
2025 
2026 	// Process urgent data
2027 	if (fSendUrgentOffset > fSendNext) {
2028 		segment.flags |= TCP_FLAG_URGENT;
2029 		segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number();
2030 	} else {
2031 		fSendUrgentOffset = fSendUnacknowledged.Number();
2032 			// Keep urgent offset updated, so that it doesn't reach into our
2033 			// send window on overlap
2034 		segment.urgent_offset = 0;
2035 	}
2036 
2037 	return segment;
2038 }
2039 
2040 
2041 status_t
2042 TCPEndpoint::_PrepareAndSend(tcp_segment_header& segment, net_buffer* buffer,
2043 	bool isRetransmit)
2044 {
2045 	LocalAddress().CopyTo(buffer->source);
2046 	PeerAddress().CopyTo(buffer->destination);
2047 
2048 	uint32 size = buffer->size, segmentLength = size;
2049 	segment.sequence = fSendNext.Number();
2050 
2051 	TRACE("_PrepareAndSend(): buffer %p (%" B_PRIu32 " bytes) address %s to "
2052 		"%s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
2053 		", rwnd %" B_PRIu16 ", cwnd %" B_PRIu32 ", ssthresh %" B_PRIu32
2054 		", len %" B_PRIu32 ", first %" B_PRIu32 ", last %" B_PRIu32,
2055 		buffer, buffer->size, PrintAddress(buffer->source),
2056 		PrintAddress(buffer->destination), segment.flags, segment.sequence,
2057 		segment.acknowledge, segment.advertised_window,
2058 		fCongestionWindow, fSlowStartThreshold, segmentLength,
2059 		fSendQueue.FirstSequence().Number(),
2060 		fSendQueue.LastSequence().Number());
2061 	T(Send(this, segment, buffer, fSendQueue.FirstSequence(),
2062 		fSendQueue.LastSequence()));
2063 
2064 	PROBE(buffer, sendWindow);
2065 
2066 	status_t status = add_tcp_header(AddressModule(), segment, buffer);
2067 	if (status != B_OK) {
2068 		gBufferModule->free(buffer);
2069 		return status;
2070 	}
2071 
2072 	if (segment.flags & TCP_FLAG_SYNCHRONIZE) {
2073 		segment.options &= ~TCP_HAS_WINDOW_SCALE;
2074 		segment.max_segment_size = 0;
2075 		size++;
2076 	}
2077 
2078 	if (segment.flags & TCP_FLAG_FINISH)
2079 		size++;
2080 
2081 	status = next->module->send_routed_data(next, fRoute, buffer);
2082 	if (status < B_OK) {
2083 		gBufferModule->free(buffer);
2084 		return status;
2085 	}
2086 
2087 	fSendNext += size;
2088 	if (fSendMax < fSendNext)
2089 		fSendMax = fSendNext;
2090 
2091 	fReceiveMaxAdvertised = fReceiveNext + segment.AdvertisedWindow(fReceiveWindowShift);
2092 
2093 	if (segmentLength != 0 && fState == ESTABLISHED)
2094 		--fSendMaxSegments;
2095 
2096 	if (fSendTime == 0 && !isRetransmit
2097 			&& (segmentLength != 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)) {
2098 		fSendTime = tcp_now();
2099 		fRoundTripStartSequence = segment.sequence;
2100 	}
2101 
2102 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
2103 		fLastAcknowledgeSent = segment.acknowledge;
2104 		gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
2105 	}
2106 
2107 	return B_OK;
2108 }
2109 
2110 
2111 inline bool
2112 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length,
2113 	uint32 segmentMaxSize, uint32 flightSize)
2114 {
2115 	if (fState == ESTABLISHED && fSendMaxSegments == 0)
2116 		return false;
2117 
2118 	if (length > 0) {
2119 		// Avoid the silly window syndrome - we only send a segment in case:
2120 		// - we have a full segment to send, or
2121 		// - we're at the end of our buffer queue, or
2122 		// - the buffer is at least larger than half of the maximum send window,
2123 		//   or
2124 		// - we're retransmitting data
2125 		if (length == segmentMaxSize
2126 			|| (fOptions & TCP_NODELAY) != 0
2127 			|| tcp_sequence(fSendNext + length) == fSendQueue.LastSequence()
2128 			|| (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2))
2129 			return true;
2130 	}
2131 
2132 	// check if we need to send a window update to the peer
2133 	if (segment.advertised_window > 0) {
2134 		// correct the window to take into account what already has been advertised
2135 		uint32 window = segment.AdvertisedWindow(fReceiveWindowShift)
2136 			- (fReceiveMaxAdvertised - fReceiveNext).Number();
2137 
2138 		// if we can advertise a window larger than twice the maximum segment
2139 		// size, or half the maximum buffer size we send a window update
2140 		if (window >= (fReceiveMaxSegmentSize << 1)
2141 			|| window >= (socket->receive.buffer_size >> 1))
2142 			return true;
2143 	}
2144 
2145 	if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH
2146 			| TCP_FLAG_RESET)) != 0)
2147 		return true;
2148 
2149 	// We do have urgent data pending
2150 	if (fSendUrgentOffset > fSendNext)
2151 		return true;
2152 
2153 	// there is no reason to send a segment just now
2154 	return false;
2155 }
2156 
2157 
2158 status_t
2159 TCPEndpoint::_SendAcknowledge(bool force)
2160 {
2161 	if (fRoute == NULL || fState == LISTEN)
2162 		return B_ERROR;
2163 
2164 	tcp_segment_header segment = _PrepareSendSegment();
2165 
2166 	// Is there actually anything to do?
2167 	if (!force && fState == ESTABLISHED
2168 			&& fLastAcknowledgeSent == fReceiveNext
2169 			&& fReceiveQueue.IsContiguous()
2170 			&& !_ShouldSendSegment(segment, 0, 0, 0))
2171 		return B_OK;
2172 
2173 	net_buffer* buffer = gBufferModule->create(256);
2174 	if (buffer == NULL)
2175 		return B_NO_MEMORY;
2176 
2177 	return _PrepareAndSend(segment, buffer, false);
2178 }
2179 
2180 
2181 /*!	Sends one or more TCP segments with the data waiting in the queue. */
2182 status_t
2183 TCPEndpoint::_SendQueued(bool force)
2184 {
2185 	if (fRoute == NULL || fState < ESTABLISHED)
2186 		return B_ERROR;
2187 
2188 	tcp_segment_header segment = _PrepareSendSegment();
2189 
2190 	uint32 sendWindow = fSendWindow;
2191 	if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
2192 		sendWindow = fCongestionWindow;
2193 
2194 	// fSendUnacknowledged
2195 	//  |    fSendNext      fSendMax
2196 	//  |        |              |
2197 	//  v        v              v
2198 	//  -----------------------------------
2199 	//  | effective window           |
2200 	//  -----------------------------------
2201 
2202 	// Flight size represents the window of data which is currently in the
2203 	// ether. We should never send data such as the flight size becomes larger
2204 	// than the effective window. Note however that the effective window may be
2205 	// reduced (by congestion for instance), so at some point in time flight
2206 	// size may be larger than the currently calculated window.
2207 
2208 	uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
2209 	uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number();
2210 
2211 	if (consumedWindow > sendWindow) {
2212 		sendWindow = 0;
2213 		// TODO: enter persist state? try to get a window update.
2214 	} else
2215 		sendWindow -= consumedWindow;
2216 
2217 	uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
2218 	if (length == 0 && !state_needs_finish(fState)) {
2219 		// Nothing to send.
2220 		return B_OK;
2221 	}
2222 
2223 	bool shouldStartRetransmitTimer = fSendNext == fSendUnacknowledged;
2224 	bool retransmit = fSendNext < fSendMax;
2225 
2226 	if (fDuplicateAcknowledgeCount != 0) {
2227 		// send at most 1 SMSS of data when under limited transmit, fast transmit/recovery
2228 		length = min_c(length, fSendMaxSegmentSize);
2229 	}
2230 
2231 	do {
2232 		uint32 segmentMaxSize = fSendMaxSegmentSize
2233 			- tcp_options_length(segment);
2234 		uint32 segmentLength = min_c(length, segmentMaxSize);
2235 
2236 		if ((fSendNext + segmentLength) == fSendQueue.LastSequence() && !force) {
2237 			if (state_needs_finish(fState))
2238 				segment.flags |= TCP_FLAG_FINISH;
2239 			if (length > 0)
2240 				segment.flags |= TCP_FLAG_PUSH;
2241 		}
2242 
2243 		// Determine if we should really send this segment
2244 		if (!force && !retransmit && !_ShouldSendSegment(segment, segmentLength,
2245 				segmentMaxSize, flightSize)) {
2246 			if (fSendQueue.Available()
2247 				&& !gStackModule->is_timer_active(&fPersistTimer)
2248 				&& !gStackModule->is_timer_active(&fRetransmitTimer))
2249 				_StartPersistTimer();
2250 			break;
2251 		}
2252 
2253 		net_buffer *buffer = gBufferModule->create(256);
2254 		if (buffer == NULL)
2255 			return B_NO_MEMORY;
2256 
2257 		status_t status = B_OK;
2258 		if (segmentLength > 0)
2259 			status = fSendQueue.Get(buffer, fSendNext, segmentLength);
2260 		if (status < B_OK) {
2261 			gBufferModule->free(buffer);
2262 			return status;
2263 		}
2264 
2265 		sendWindow -= buffer->size;
2266 
2267 		status = _PrepareAndSend(segment, buffer, retransmit);
2268 		if (status != B_OK)
2269 			return status;
2270 
2271 		if (shouldStartRetransmitTimer) {
2272 			TRACE("starting initial retransmit timer of: %" B_PRIdBIGTIME,
2273 				fRetransmitTimeout);
2274 			gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
2275 			T(TimerSet(this, "retransmit", fRetransmitTimeout));
2276 			shouldStartRetransmitTimer = false;
2277 		}
2278 
2279 		length -= segmentLength;
2280 		segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET
2281 			| TCP_FLAG_FINISH);
2282 
2283 		if (retransmit)
2284 			break;
2285 
2286 	} while (length > 0);
2287 
2288 	return B_OK;
2289 }
2290 
2291 
2292 int
2293 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const
2294 {
2295 	return next->module->get_mtu(next, address) - sizeof(tcp_header);
2296 }
2297 
2298 
2299 status_t
2300 TCPEndpoint::_PrepareSendPath(const sockaddr* peer)
2301 {
2302 	if (fRoute == NULL) {
2303 		fRoute = gDatalinkModule->get_route(Domain(), peer);
2304 		if (fRoute == NULL)
2305 			return ENETUNREACH;
2306 
2307 		if ((fRoute->flags & RTF_LOCAL) != 0)
2308 			fFlags |= FLAG_LOCAL;
2309 	}
2310 
2311 	// make sure connection does not already exist
2312 	status_t status = fManager->SetConnection(this, *LocalAddress(), peer,
2313 		fRoute->interface_address->local);
2314 	if (status < B_OK)
2315 		return status;
2316 
2317 	fInitialSendSequence = system_time() >> 4;
2318 	fSendNext = fInitialSendSequence;
2319 	fSendUnacknowledged = fInitialSendSequence;
2320 	fSendMax = fInitialSendSequence;
2321 	fSendUrgentOffset = fInitialSendSequence;
2322 	fRecover = fInitialSendSequence.Number();
2323 
2324 	// we are counting the SYN here
2325 	fSendQueue.SetInitialSequence(fSendNext + 1);
2326 
2327 	fReceiveMaxSegmentSize = _MaxSegmentSize(peer);
2328 
2329 	// Compute the window shift we advertise to our peer - if it doesn't support
2330 	// this option, this will be reset to 0 (when its SYN is received)
2331 	fReceiveWindowShift = 0;
2332 	while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT
2333 		&& (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) {
2334 		fReceiveWindowShift++;
2335 	}
2336 
2337 	return B_OK;
2338 }
2339 
2340 
2341 void
2342 TCPEndpoint::_Acknowledged(tcp_segment_header& segment)
2343 {
2344 	TRACE("_Acknowledged(): ack %" B_PRIu32 "; uack %" B_PRIu32 "; next %"
2345 		B_PRIu32 "; max %" B_PRIu32, segment.acknowledge,
2346 		fSendUnacknowledged.Number(), fSendNext.Number(), fSendMax.Number());
2347 
2348 	ASSERT(fSendUnacknowledged <= segment.acknowledge);
2349 
2350 	if (fSendUnacknowledged < segment.acknowledge) {
2351 		fSendQueue.RemoveUntil(segment.acknowledge);
2352 
2353 		uint32 bytesAcknowledged = segment.acknowledge - fSendUnacknowledged.Number();
2354 		fPreviousHighestAcknowledge = fSendUnacknowledged;
2355 		fSendUnacknowledged = segment.acknowledge;
2356 		uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
2357 		int32 expectedSamples = flightSize / (fSendMaxSegmentSize << 1);
2358 
2359 		if (fPreviousHighestAcknowledge > fSendUnacknowledged) {
2360 			// need to update the recover variable upon a sequence wraparound
2361 			fRecover = segment.acknowledge - 1;
2362 		}
2363 
2364 		// the acknowledgment of the SYN/ACK MUST NOT increase the size of the congestion window
2365 		if (fSendUnacknowledged != fInitialSendSequence) {
2366 			if (fCongestionWindow < fSlowStartThreshold)
2367 				fCongestionWindow += min_c(bytesAcknowledged, fSendMaxSegmentSize);
2368 			else {
2369 				uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize;
2370 
2371 				if (increment < fCongestionWindow)
2372 					increment = 1;
2373 				else
2374 					increment /= fCongestionWindow;
2375 
2376 				fCongestionWindow += increment;
2377 			}
2378 
2379 			fSendMaxSegments = UINT32_MAX;
2380 		}
2381 
2382 		if ((fFlags & FLAG_RECOVERY) != 0) {
2383 			fSendNext = fSendUnacknowledged;
2384 			_SendQueued();
2385 			fCongestionWindow -= bytesAcknowledged;
2386 
2387 			if (bytesAcknowledged > fSendMaxSegmentSize)
2388 				fCongestionWindow += fSendMaxSegmentSize;
2389 
2390 			fSendNext = fSendMax;
2391 		} else
2392 			fDuplicateAcknowledgeCount = 0;
2393 
2394 		if (fSendNext < fSendUnacknowledged)
2395 			fSendNext = fSendUnacknowledged;
2396 
2397 		if (fFlags & FLAG_OPTION_TIMESTAMP) {
2398 			_UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply),
2399 				expectedSamples > 0 ? expectedSamples : 1);
2400 		} else if (fSendTime != 0 && fRoundTripStartSequence < segment.acknowledge) {
2401 			_UpdateRoundTripTime(tcp_diff_timestamp(fSendTime), 1);
2402 			fSendTime = 0;
2403 		}
2404 
2405 		if (fSendUnacknowledged == fSendMax) {
2406 			TRACE("all acknowledged, cancelling retransmission timer.");
2407 			gStackModule->cancel_timer(&fRetransmitTimer);
2408 			T(TimerSet(this, "retransmit", -1));
2409 		} else {
2410 			TRACE("data acknowledged, resetting retransmission timer to: %"
2411 				B_PRIdBIGTIME, fRetransmitTimeout);
2412 			gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
2413 			T(TimerSet(this, "retransmit", fRetransmitTimeout));
2414 		}
2415 
2416 		if (is_writable(fState)) {
2417 			// notify threads waiting on the socket to become writable again
2418 			fSendCondition.NotifyAll();
2419 			gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free());
2420 		}
2421 	}
2422 
2423 	// if there is data left to be sent, send it now
2424 	if (fSendQueue.Used() > 0)
2425 		_SendQueued();
2426 }
2427 
2428 
2429 void
2430 TCPEndpoint::_Retransmit()
2431 {
2432 	TRACE("Retransmit()");
2433 
2434 	if (fState < ESTABLISHED) {
2435 		fRetransmitTimeout = TCP_SYN_RETRANSMIT_TIMEOUT;
2436 		fCongestionWindow = fSendMaxSegmentSize;
2437 	} else {
2438 		_ResetSlowStart();
2439 		fDuplicateAcknowledgeCount = 0;
2440 		// Do exponential back off of the retransmit timeout
2441 		fRetransmitTimeout *= 2;
2442 		if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT)
2443 			fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT;
2444 	}
2445 
2446 	fSendNext = fSendUnacknowledged;
2447 	_SendQueued();
2448 
2449 	fRecover = fSendNext.Number() - 1;
2450 	if ((fFlags & FLAG_RECOVERY) != 0)
2451 		fFlags &= ~FLAG_RECOVERY;
2452 }
2453 
2454 
2455 void
2456 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime, int32 expectedSamples)
2457 {
2458 	if (fSmoothedRoundTripTime == 0) {
2459 		fSmoothedRoundTripTime = roundTripTime;
2460 		fRoundTripVariation = roundTripTime / 2;
2461 		fRetransmitTimeout = (fSmoothedRoundTripTime + max_c(100, fRoundTripVariation * 4))
2462 				* kTimestampFactor;
2463 	} else {
2464 		int32 delta = fSmoothedRoundTripTime - roundTripTime;
2465 		if (delta < 0)
2466 			delta = -delta;
2467 		fRoundTripVariation += (delta - fRoundTripVariation) / (expectedSamples * 4);
2468 		fSmoothedRoundTripTime += (roundTripTime - fSmoothedRoundTripTime) / (expectedSamples * 8);
2469 		fRetransmitTimeout = (fSmoothedRoundTripTime + max_c(100, fRoundTripVariation * 4))
2470 			* kTimestampFactor;
2471 	}
2472 
2473 	if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT)
2474 		fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT;
2475 
2476 	if (fRetransmitTimeout < TCP_MIN_RETRANSMIT_TIMEOUT)
2477 		fRetransmitTimeout = TCP_MIN_RETRANSMIT_TIMEOUT;
2478 
2479 	TRACE("  RTO is now %" B_PRIdBIGTIME " (after rtt %" B_PRId32 "ms)",
2480 		fRetransmitTimeout, roundTripTime);
2481 }
2482 
2483 
2484 void
2485 TCPEndpoint::_ResetSlowStart()
2486 {
2487 	fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2,
2488 		2 * fSendMaxSegmentSize);
2489 	fCongestionWindow = fSendMaxSegmentSize;
2490 }
2491 
2492 
2493 //	#pragma mark - timer
2494 
2495 
2496 /*static*/ void
2497 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint)
2498 {
2499 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2500 	T(TimerTriggered(endpoint, "retransmit"));
2501 
2502 	MutexLocker locker(endpoint->fLock);
2503 	if (!locker.IsLocked() || gStackModule->is_timer_active(timer))
2504 		return;
2505 
2506 	endpoint->_Retransmit();
2507 }
2508 
2509 
2510 /*static*/ void
2511 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint)
2512 {
2513 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2514 	T(TimerTriggered(endpoint, "persist"));
2515 
2516 	MutexLocker locker(endpoint->fLock);
2517 	if (!locker.IsLocked())
2518 		return;
2519 
2520 	// the timer might not have been canceled early enough
2521 	if (endpoint->State() == CLOSED)
2522 		return;
2523 
2524 	endpoint->_SendQueued(true);
2525 }
2526 
2527 
2528 /*static*/ void
2529 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint)
2530 {
2531 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2532 	T(TimerTriggered(endpoint, "delayed ack"));
2533 
2534 	MutexLocker locker(endpoint->fLock);
2535 	if (!locker.IsLocked())
2536 		return;
2537 
2538 	// the timer might not have been canceled early enough
2539 	if (endpoint->State() == CLOSED)
2540 		return;
2541 
2542 	endpoint->_SendAcknowledge();
2543 }
2544 
2545 
2546 /*static*/ void
2547 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint)
2548 {
2549 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2550 	T(TimerTriggered(endpoint, "time-wait"));
2551 
2552 	MutexLocker locker(endpoint->fLock);
2553 	if (!locker.IsLocked())
2554 		return;
2555 
2556 	if ((endpoint->fFlags & FLAG_CLOSED) == 0) {
2557 		endpoint->fFlags |= FLAG_DELETE_ON_CLOSE;
2558 		return;
2559 	}
2560 
2561 	locker.Unlock();
2562 
2563 	gSocketModule->release_socket(endpoint->socket);
2564 }
2565 
2566 
2567 /*static*/ status_t
2568 TCPEndpoint::_WaitForCondition(ConditionVariable& condition,
2569 	MutexLocker& locker, bigtime_t timeout)
2570 {
2571 	ConditionVariableEntry entry;
2572 	condition.Add(&entry);
2573 
2574 	locker.Unlock();
2575 	status_t result = entry.Wait(B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, timeout);
2576 	locker.Lock();
2577 
2578 	return result;
2579 }
2580 
2581 
2582 //	#pragma mark -
2583 
2584 
2585 void
2586 TCPEndpoint::Dump() const
2587 {
2588 	kprintf("TCP endpoint %p\n", this);
2589 	kprintf("  state: %s\n", name_for_state(fState));
2590 	kprintf("  flags: 0x%" B_PRIx32 "\n", fFlags);
2591 #if KDEBUG
2592 	kprintf("  lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder);
2593 #endif
2594 	kprintf("  accept sem: %" B_PRId32 "\n", fAcceptSemaphore);
2595 	kprintf("  options: 0x%" B_PRIx32 "\n", (uint32)fOptions);
2596 	kprintf("  send\n");
2597 	kprintf("    window shift: %" B_PRIu8 "\n", fSendWindowShift);
2598 	kprintf("    unacknowledged: %" B_PRIu32 "\n",
2599 		fSendUnacknowledged.Number());
2600 	kprintf("    next: %" B_PRIu32 "\n", fSendNext.Number());
2601 	kprintf("    max: %" B_PRIu32 "\n", fSendMax.Number());
2602 	kprintf("    urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number());
2603 	kprintf("    window: %" B_PRIu32 "\n", fSendWindow);
2604 	kprintf("    max window: %" B_PRIu32 "\n", fSendMaxWindow);
2605 	kprintf("    max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize);
2606 	kprintf("    queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", fSendQueue.Used(),
2607 		fSendQueue.Size());
2608 #if DEBUG_TCP_BUFFER_QUEUE
2609 	fSendQueue.Dump();
2610 #endif
2611 	kprintf("    last acknowledge sent: %" B_PRIu32 "\n",
2612 		fLastAcknowledgeSent.Number());
2613 	kprintf("    initial sequence: %" B_PRIu32 "\n",
2614 		fInitialSendSequence.Number());
2615 	kprintf("  receive\n");
2616 	kprintf("    window shift: %" B_PRIu8 "\n", fReceiveWindowShift);
2617 	kprintf("    next: %" B_PRIu32 "\n", fReceiveNext.Number());
2618 	kprintf("    max advertised: %" B_PRIu32 "\n",
2619 		fReceiveMaxAdvertised.Number());
2620 	kprintf("    window: %" B_PRIu32 "\n", fReceiveWindow);
2621 	kprintf("    max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize);
2622 	kprintf("    queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n",
2623 		fReceiveQueue.Available(), fReceiveQueue.Size());
2624 #if DEBUG_TCP_BUFFER_QUEUE
2625 	fReceiveQueue.Dump();
2626 #endif
2627 	kprintf("    initial sequence: %" B_PRIu32 "\n",
2628 		fInitialReceiveSequence.Number());
2629 	kprintf("    duplicate acknowledge count: %" B_PRIu32 "\n",
2630 		fDuplicateAcknowledgeCount);
2631 	kprintf("  smoothed round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n",
2632 		fSmoothedRoundTripTime, fRoundTripVariation);
2633 	kprintf("  retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout);
2634 	kprintf("  congestion window: %" B_PRIu32 "\n", fCongestionWindow);
2635 	kprintf("  slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold);
2636 }
2637 
2638