xref: /haiku/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp (revision 909af08f4328301fbdef1ffb41f566c3b5bec0c7)
1 /*
2  * Copyright 2006-2010, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Andrew Galante, haiku.galante@gmail.com
7  *		Axel Dörfler, axeld@pinc-software.de
8  *		Hugo Santos, hugosantos@gmail.com
9  */
10 
11 
12 #include "TCPEndpoint.h"
13 
14 #include <netinet/in.h>
15 #include <netinet/ip.h>
16 #include <netinet/tcp.h>
17 #include <new>
18 #include <signal.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <stdint.h>
22 
23 #include <KernelExport.h>
24 #include <Select.h>
25 
26 #include <net_buffer.h>
27 #include <net_datalink.h>
28 #include <net_stat.h>
29 #include <NetBufferUtilities.h>
30 #include <NetUtilities.h>
31 
32 #include <lock.h>
33 #include <low_resource_manager.h>
34 #include <tracing.h>
35 #include <util/AutoLock.h>
36 #include <util/list.h>
37 
38 #include "EndpointManager.h"
39 
40 
41 // References:
42 //	- RFC 793 - Transmission Control Protocol
43 //	- RFC 813 - Window and Acknowledgement Strategy in TCP
44 //	- RFC 1337 - TIME_WAIT Assassination Hazards in TCP
45 //
46 // Things incomplete in this implementation:
47 //	- TCP Extensions for High Performance, RFC 1323 - RTTM, PAWS
48 //	- Congestion Control, RFC 5681
49 //	- Limited Transmit, RFC 3042
50 //	- SACK, Selective Acknowledgment; RFC 2018, RFC 2883, RFC 6675
51 //	- NewReno Modification to TCP's Fast Recovery, RFC 2582
52 //
53 // Things this implementation currently doesn't implement:
54 //	- Explicit Congestion Notification (ECN), RFC 3168
55 //	- SYN-Cache
56 //	- Forward RTO-Recovery, RFC 4138
57 //	- Time-Wait hash instead of keeping sockets alive
58 
// Formats \a address as a printable string for the TRACE()/PROBE() output.
// Expands to a call of Domain(), so it can only be used where that is in
// scope. The returned pointer comes from a temporary AddressString and must
// only be used within the same full expression.
#define PrintAddress(address) \
	AddressString(Domain(), address, true).Data()
61 
62 //#define TRACE_TCP
63 //#define PROBE_TCP
64 
// Debug output helper. With TRACE_TCP defined it prints the current thread
// id, the system time, the endpoint pointer and the name of its state,
// followed by the formatted message, via dprintf(). It references `this` and
// `fState`, so it is only usable inside TCPEndpoint member functions.
// Without TRACE_TCP it expands to an empty statement and its arguments are
// not evaluated.
#ifdef TRACE_TCP
// the space before ', ##args' is important in order for this to work with cpp 2.95
#	define TRACE(format, args...)	dprintf("%" B_PRId32 ": TCP [%" \
		B_PRIdBIGTIME "] %p (%12s) " format "\n", find_thread(NULL), \
		system_time(), this, name_for_state(fState) , ##args)
#else
#	define TRACE(args...)			do { } while (0)
#endif
73 
// Per-segment send statistics dump. With PROBE_TCP defined it prints, via
// dprintf(), the buffer's source/destination addresses and size together with
// the endpoint's send sequence numbers, congestion window, slow start
// threshold, the passed-in \a window, the send window, send queue usage and
// the retransmit timeout. It reads TCPEndpoint members directly, so it is only
// usable inside member functions. Without PROBE_TCP it is an empty statement.
#ifdef PROBE_TCP
#	define PROBE(buffer, window) \
	dprintf("TCP PROBE %" B_PRIdBIGTIME " %s %s %" B_PRIu32 " snxt %" B_PRIu32 \
		" suna %" B_PRIu32 " cw %" B_PRIu32 " sst %" B_PRIu32 " win %" \
		B_PRIu32 " swin %" B_PRIu32 " smax-suna %" B_PRIu32 " savail %" \
		B_PRIuSIZE " sqused %" B_PRIuSIZE " rto %" B_PRIdBIGTIME "\n", \
		system_time(), PrintAddress(buffer->source), \
		PrintAddress(buffer->destination), buffer->size, fSendNext.Number(), \
		fSendUnacknowledged.Number(), fCongestionWindow, fSlowStartThreshold, \
		window, fSendWindow, (fSendMax - fSendUnacknowledged).Number(), \
		fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout)
#else
#	define PROBE(buffer, window)	do { } while (0)
#endif
88 
89 #if TCP_TRACING
90 namespace TCPTracing {
91 
// Trace entry describing a received segment. The constructor snapshots the
// endpoint pointer and its current state, the buffer pointer and size, the
// segment's sequence/acknowledge numbers and flags, and the \a window value
// passed by the caller. AddDump() prints that snapshot.
class Receive : public AbstractTraceEntry {
public:
	Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window,
			net_buffer* buffer)
		:
		fEndpoint(endpoint),
		fBuffer(buffer),
		fBufferSize(buffer->size),
		fSequence(segment.sequence),
		fAcknowledge(segment.acknowledge),
		fWindow(window),
		fState(endpoint->State()),
		fFlags(segment.flags)
	{
		Initialized();
	}

	// Writes the captured values to \a out in a single line.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) receive buffer %p (%" B_PRIu32 " bytes), "
			"flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
			", wnd %" B_PRIu32, fEndpoint, name_for_state(fState), fBuffer,
			fBufferSize, fFlags, fSequence, fAcknowledge, fWindow);
	}

protected:
	TCPEndpoint*	fEndpoint;
	net_buffer*		fBuffer;
		// only the pointer value is kept (printed with %p), not the data
	uint32			fBufferSize;
	uint32			fSequence;
	uint32			fAcknowledge;
	uint32			fWindow;
	tcp_state		fState;
		// endpoint state at the time the entry was created
	uint8			fFlags;
};
127 
// Trace entry describing a sent segment. In addition to the values captured
// by the receive entry (endpoint, state, buffer pointer/size, sequence,
// acknowledge, flags) it stores the numeric values of the \a firstSequence
// and \a lastSequence arguments supplied by the caller.
class Send : public AbstractTraceEntry {
public:
	Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer,
			tcp_sequence firstSequence, tcp_sequence lastSequence)
		:
		fEndpoint(endpoint),
		fBuffer(buffer),
		fBufferSize(buffer->size),
		fSequence(segment.sequence),
		fAcknowledge(segment.acknowledge),
		fFirstSequence(firstSequence.Number()),
		fLastSequence(lastSequence.Number()),
		fState(endpoint->State()),
		fFlags(segment.flags)
	{
		Initialized();
	}

	// Writes the captured values to \a out in a single line.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) send buffer %p (%" B_PRIu32 " bytes), "
			"flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
			", first %" B_PRIu32 ", last %" B_PRIu32, fEndpoint,
			name_for_state(fState), fBuffer, fBufferSize, fFlags, fSequence,
			fAcknowledge, fFirstSequence, fLastSequence);
	}

protected:
	TCPEndpoint*	fEndpoint;
	net_buffer*		fBuffer;
		// only the pointer value is kept (printed with %p), not the data
	uint32			fBufferSize;
	uint32			fSequence;
	uint32			fAcknowledge;
	uint32			fFirstSequence;
	uint32			fLastSequence;
	tcp_state		fState;
		// endpoint state at the time the entry was created
	uint8			fFlags;
};
166 
// Trace entry recording a state change. It stores the endpoint and the state
// the endpoint reports at construction time, so it is meant to be created
// right after fState has been assigned.
class State : public AbstractTraceEntry {
public:
	State(TCPEndpoint* endpoint)
		:
		fEndpoint(endpoint),
		fState(endpoint->State())
	{
		Initialized();
	}

	// Prints the endpoint pointer and the name of the recorded state.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) state change", fEndpoint,
			name_for_state(fState));
	}

protected:
	TCPEndpoint*	fEndpoint;
	tcp_state		fState;
};
187 
// Trace entry linking a listening endpoint to an endpoint spawned from it.
// Only the two pointers are stored; no state is captured.
class Spawn : public AbstractTraceEntry {
public:
	Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint)
		:
		fListeningEndpoint(listeningEndpoint),
		fSpawnedEndpoint(spawnedEndpoint)
	{
		Initialized();
	}

	// Prints both endpoint pointers.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint);
	}

protected:
	TCPEndpoint*	fListeningEndpoint;
	TCPEndpoint*	fSpawnedEndpoint;
};
207 
// Trace entry recording an error message together with a source line number
// and the endpoint's state at construction time. The \a error string is
// stored by pointer only, so it has to outlive the trace entry
// (presumably callers pass string literals — confirm at the call sites).
class Error : public AbstractTraceEntry {
public:
	Error(TCPEndpoint* endpoint, const char* error, int32 line)
		:
		fEndpoint(endpoint),
		fLine(line),
		fError(error),
		fState(endpoint->State())
	{
		Initialized();
	}

	// Prints endpoint, state name, line number and the error text.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) error at line %" B_PRId32 ": %s", fEndpoint,
			name_for_state(fState), fLine, fError);
	}

protected:
	TCPEndpoint*	fEndpoint;
	int32			fLine;
	const char*		fError;
		// not copied, see class comment
	tcp_state		fState;
};
232 
// Trace entry recording that a timer was (re)armed or cancelled. \a which
// names the timer and is stored by pointer only; \a timeout is the value
// reported by the caller. Callers in this file pass -1 as the timeout right
// after cancelling a timer.
class TimerSet : public AbstractTraceEntry {
public:
	TimerSet(TCPEndpoint* endpoint, const char* which, bigtime_t timeout)
		:
		fEndpoint(endpoint),
		fWhich(which),
		fTimeout(timeout),
		fState(endpoint->State())
	{
		Initialized();
	}

	// Prints endpoint, state name, timer name and the recorded timeout.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) %s timer set to %" B_PRIdBIGTIME, fEndpoint,
			name_for_state(fState), fWhich, fTimeout);
	}

protected:
	TCPEndpoint*	fEndpoint;
	const char*		fWhich;
		// not copied; must outlive the entry
	bigtime_t		fTimeout;
	tcp_state		fState;
};
257 
// Trace entry recording that the timer named \a which fired. Stores the
// endpoint, the timer name (by pointer only) and the endpoint's state at
// construction time.
class TimerTriggered : public AbstractTraceEntry {
public:
	TimerTriggered(TCPEndpoint* endpoint, const char* which)
		:
		fEndpoint(endpoint),
		fWhich(which),
		fState(endpoint->State())
	{
		Initialized();
	}

	// Prints endpoint, state name and timer name.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) %s timer triggered", fEndpoint,
			name_for_state(fState), fWhich);
	}

protected:
	TCPEndpoint*	fEndpoint;
	const char*		fWhich;
		// not copied; must outlive the entry
	tcp_state		fState;
};
280 
// Trace entry recording entry into one of the endpoint's API methods. \a which
// is the method's name (stored by pointer only); the endpoint's state at
// construction time is captured alongside it.
class APICall : public AbstractTraceEntry {
public:
	APICall(TCPEndpoint* endpoint, const char* which)
		:
		fEndpoint(endpoint),
		fWhich(which),
		fState(endpoint->State())
	{
		Initialized();
	}

	// Prints endpoint, state name and the API method name.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) api call: %s", fEndpoint,
			name_for_state(fState), fWhich);
	}

protected:
	TCPEndpoint*	fEndpoint;
	const char*		fWhich;
		// not copied; must outlive the entry
	tcp_state		fState;
};
303 
304 }	// namespace TCPTracing
305 
306 #	define T(x)	new(std::nothrow) TCPTracing::x
307 #else
308 #	define T(x)
309 #endif	// TCP_TRACING
310 
311 
// constants for the fFlags field
// The constructor starts every endpoint with FLAG_OPTION_WINDOW_SCALE,
// FLAG_OPTION_TIMESTAMP, FLAG_OPTION_SACK_PERMITTED and
// FLAG_AUTO_RECEIVE_BUFFER_SIZE set.
enum {
	FLAG_OPTION_WINDOW_SCALE	= 0x01,
	FLAG_OPTION_TIMESTAMP		= 0x02,
	// TODO: Should FLAG_NO_RECEIVE apply as well to received connections?
	//       That is, what is expected from accept() after a shutdown()
	//       is performed on a listen()ing socket.
	// Set by Shutdown() for SHUT_RD and SHUT_RDWR; ReadData() returns B_OK
	// instead of waiting for data once it is set.
	FLAG_NO_RECEIVE				= 0x04,
	// Set by Free() for endpoints that got past SYNCHRONIZE_SENT.
	FLAG_CLOSED					= 0x08,
	// When set, Free() neither acquires a socket reference nor arms the
	// time-wait timer.
	FLAG_DELETE_ON_CLOSE		= 0x10,
	// Reported by IsLocal().
	FLAG_LOCAL					= 0x20,
	FLAG_RECOVERY				= 0x40,
	FLAG_OPTION_SACK_PERMITTED	= 0x80,
	// Cleared by SetReceiveBufferSize().
	// NOTE(review): 0x100 does not fit into 8 bits, so fFlags must be wider
	// than uint8 — confirm against the declaration in TCPEndpoint.h.
	FLAG_AUTO_RECEIVE_BUFFER_SIZE = 0x100,
};
327 
328 
// Divisor applied to system_time() by tcp_now().
static const int kTimestampFactor = 1000;
	// conversion factor between usec system time and msec tcp time
331 
332 
333 static inline bigtime_t
334 absolute_timeout(bigtime_t timeout)
335 {
336 	if (timeout == 0 || timeout == B_INFINITE_TIMEOUT)
337 		return timeout;
338 
339 	return timeout + system_time();
340 }
341 
342 
343 static inline status_t
344 posix_error(status_t error)
345 {
346 	if (error == B_TIMED_OUT)
347 		return B_WOULD_BLOCK;
348 
349 	return error;
350 }
351 
352 
353 static inline bool
354 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext,
355 	uint32 receiveWindow)
356 {
357 	return sequence >= receiveNext && sequence < (receiveNext + receiveWindow);
358 }
359 
360 
361 static inline bool
362 segment_in_sequence(const tcp_segment_header& segment, int size,
363 	const tcp_sequence& receiveNext, uint32 receiveWindow)
364 {
365 	tcp_sequence sequence(segment.sequence);
366 	if (size == 0) {
367 		if (receiveWindow == 0)
368 			return sequence == receiveNext;
369 		return in_window(sequence, receiveNext, receiveWindow);
370 	} else {
371 		if (receiveWindow == 0)
372 			return false;
373 		return in_window(sequence, receiveNext, receiveWindow)
374 			|| in_window(sequence + size - 1, receiveNext, receiveWindow);
375 	}
376 }
377 
378 
379 static inline bool
380 is_writable(tcp_state state)
381 {
382 	return state == ESTABLISHED || state == FINISH_RECEIVED;
383 }
384 
385 
386 static inline bool
387 is_establishing(tcp_state state)
388 {
389 	return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED;
390 }
391 
392 
393 static inline uint32
394 tcp_now()
395 {
396 	return system_time() / kTimestampFactor;
397 }
398 
399 
400 static inline uint32
401 tcp_diff_timestamp(uint32 base)
402 {
403 	const uint32 now = tcp_now();
404 	if (now >= base)
405 		return now - base;
406 
407 	return now + UINT_MAX - base;
408 }
409 
410 
411 static inline bool
412 state_needs_finish(int32 state)
413 {
414 	return state == WAIT_FOR_FINISH_ACKNOWLEDGE
415 		|| state == FINISH_SENT || state == CLOSING;
416 }
417 
418 
419 //	#pragma mark -
420 
421 
422 TCPEndpoint::TCPEndpoint(net_socket* socket)
423 	:
424 	ProtocolSocket(socket),
425 	fManager(NULL),
426 	fOptions(0),
427 	fSendWindowShift(0),
428 	fReceiveWindowShift(0),
429 	fSendUnacknowledged(0),
430 	fSendNext(0),
431 	fSendMax(0),
432 	fSendUrgentOffset(0),
433 	fSendWindow(0),
434 	fSendMaxWindow(0),
435 	fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
436 	fSendMaxSegments(0),
437 	fSendQueue(socket->send.buffer_size),
438 	fInitialSendSequence(0),
439 	fPreviousHighestAcknowledge(0),
440 	fDuplicateAcknowledgeCount(0),
441 	fPreviousFlightSize(0),
442 	fRecover(0),
443 	fRoute(NULL),
444 	fReceiveNext(0),
445 	fReceiveMaxAdvertised(0),
446 	fReceiveWindow(socket->receive.buffer_size),
447 	fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
448 	fReceiveQueue(socket->receive.buffer_size),
449 	fSmoothedRoundTripTime(-1),
450 	fRoundTripVariation(0),
451 	fSendTime(0),
452 	fRoundTripStartSequence(0),
453 	fRetransmitTimeout(TCP_INITIAL_RTT),
454 	fReceivedTimestamp(0),
455 	fCongestionWindow(0),
456 	fSlowStartThreshold(0),
457 	fState(CLOSED),
458 	fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP
459 		| FLAG_OPTION_SACK_PERMITTED | FLAG_AUTO_RECEIVE_BUFFER_SIZE)
460 {
461 	// TODO: to be replaced with a real read/write locking strategy!
462 	mutex_init(&fLock, "tcp lock");
463 
464 	fReceiveCondition.Init(this, "tcp receive");
465 	fSendCondition.Init(this, "tcp send");
466 
467 	gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this);
468 	gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer,
469 		this);
470 	gStackModule->init_timer(&fDelayedAcknowledgeTimer,
471 		TCPEndpoint::_DelayedAcknowledgeTimer, this);
472 	gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer,
473 		this);
474 
475 	T(APICall(this, "constructor"));
476 }
477 
478 
/*!	Tears the endpoint down: cancels all of its timers, detaches it from the
	endpoint manager, destroys the lock, waits for timer hooks to return and
	finally releases the route.
*/
TCPEndpoint::~TCPEndpoint()
{
	// The lock is acquired here and never released again; it is destroyed
	// below while still being held.
	mutex_lock(&fLock);

	T(APICall(this, "destructor"));

	// Cancel the retransmit, persist and delayed ack timers, then the
	// time-wait timer, which _CancelConnectionTimers() does not cover.
	_CancelConnectionTimers();
	gStackModule->cancel_timer(&fTimeWaitTimer);
	T(TimerSet(this, "time-wait", -1));

	// fManager is only set by Open(), so it may still be NULL.
	if (fManager != NULL) {
		fManager->Unbind(this);
		put_endpoint_manager(fManager);
	}

	mutex_destroy(&fLock);

	// we need to wait for all timers to return
	gStackModule->wait_for_timer(&fRetransmitTimer);
	gStackModule->wait_for_timer(&fPersistTimer);
	gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer);
	gStackModule->wait_for_timer(&fTimeWaitTimer);

	// NOTE(review): fRoute is initialized to NULL in the constructor;
	// put_route() is presumably fine with that — confirm.
	gDatalinkModule->put_route(Domain(), fRoute);
}
504 
505 
506 status_t
507 TCPEndpoint::InitCheck() const
508 {
509 	return B_OK;
510 }
511 
512 
513 //	#pragma mark - protocol API
514 
515 
516 status_t
517 TCPEndpoint::Open()
518 {
519 	TRACE("Open()");
520 	T(APICall(this, "open"));
521 
522 	status_t status = ProtocolSocket::Open();
523 	if (status < B_OK)
524 		return status;
525 
526 	fManager = get_endpoint_manager(Domain());
527 	if (fManager == NULL)
528 		return EAFNOSUPPORT;
529 
530 	return B_OK;
531 }
532 
533 
/*!	Closes the endpoint.

	- In LISTEN and SYNCHRONIZE_SENT the endpoint simply becomes CLOSED (the
	  accept semaphore is deleted first when listening).
	- With SO_LINGER and a zero linger time the endpoint becomes CLOSED and
	  _SendQueued(true) is invoked right away.
	- Otherwise the connection is shut down via _Disconnect(); with SO_LINGER
	  set, the call then waits until the send queue is empty or the linger
	  time has passed.

	\return \c B_OK, or the error of _SendQueued(), _Disconnect() or of the
		wait (time-outs while lingering are not reported as errors).
*/
status_t
TCPEndpoint::Close()
{
	MutexLocker locker(fLock);

	TRACE("Close()");
	T(APICall(this, "close"));

	// Deleting the semaphore also ends any acquire_sem_etc() in Accept().
	if (fState == LISTEN)
		delete_sem(fAcceptSemaphore);

	if (fState == SYNCHRONIZE_SENT || fState == LISTEN) {
		// TODO: what about linger in case of SYNCHRONIZE_SENT?
		fState = CLOSED;
		T(State(this));
		return B_OK;
	}

	// handle linger with zero timeout
	if ((socket->options & SO_LINGER) != 0 && socket->linger == 0) {
		// The state is changed before sending, so _SendQueued() already
		// sees CLOSED (presumably resulting in a reset — confirm there).
		fState = CLOSED;
		T(State(this));
		return _SendQueued(true);
	}

	status_t status = _Disconnect(true);
	if (status != B_OK)
		return status;

	if ((socket->options & SO_LINGER) != 0) {
		TRACE("Close(): Lingering for %i secs", socket->linger);

		// socket->linger is in seconds; convert to an absolute deadline in
		// microseconds
		bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL);

		while (fSendQueue.Used() > 0) {
			status = _WaitForCondition(fSendCondition, locker, maximum);
			// running out of linger time is not an error for the caller
			if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
				break;
			else if (status < B_OK)
				return status;
		}

		TRACE("Close(): after waiting, the SendQ was left with %" B_PRIuSIZE
			" bytes.", fSendQueue.Used());
	}
	return B_OK;
}
581 
582 
583 void
584 TCPEndpoint::Free()
585 {
586 	MutexLocker _(fLock);
587 
588 	TRACE("Free()");
589 	T(APICall(this, "free"));
590 
591 	if (fState <= SYNCHRONIZE_SENT)
592 		return;
593 
594 	fFlags |= FLAG_CLOSED;
595 	if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) {
596 		// we'll be freed later when the 2MSL timer expires
597 		gSocketModule->acquire_socket(socket);
598 
599 		// we are only interested in the timer, not in changing state
600 		_EnterTimeWait();
601 	}
602 }
603 
604 
/*!	Creates and sends a synchronize packet to \a address, and then waits
	until the connection has been established or refused.

	\return \c EAFNOSUPPORT if \a address belongs to another address family,
		\c EISCONN if the endpoint is already ESTABLISHED, \c EALREADY in any
		other state but CLOSED and LISTEN, \c EINPROGRESS for sockets with a
		zero send timeout whose handshake has not completed yet, otherwise
		the (posix_error() translated) result of the handshake.
*/
status_t
TCPEndpoint::Connect(const sockaddr* address)
{
	if (!AddressModule()->is_same_family(address))
		return EAFNOSUPPORT;

	MutexLocker locker(fLock);

	TRACE("Connect() on address %s", PrintAddress(address));
	T(APICall(this, "connect"));

	// A restarted syscall has already sent its SYN; just continue waiting
	// with the absolute timeout that was stored by the first invocation.
	if (gStackModule->is_restarted_syscall()) {
		bigtime_t timeout = gStackModule->restore_syscall_restart_timeout();
		status_t status = _WaitForEstablished(locker, timeout);
		TRACE("  Connect(): Connection complete: %s (timeout was %"
			B_PRIdBIGTIME ")", strerror(status), timeout);
		return posix_error(status);
	}

	// Can only call connect() from CLOSED or LISTEN states
	// otherwise endpoint is considered already connected
	if (fState == LISTEN) {
		// this socket is about to connect; remove pending connections in the backlog
		gSocketModule->set_max_backlog(socket, 0);
	} else if (fState == ESTABLISHED) {
		return EISCONN;
	} else if (fState != CLOSED)
		return EALREADY;

	// consider destination address INADDR_ANY as INADDR_LOOPBACK
	sockaddr_storage _address;
	if (AddressModule()->is_empty_address(address, false)) {
		AddressModule()->get_loopback_address((sockaddr *)&_address);
		// for IPv4 and IPv6 the port is at the same offset
		((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port;
		address = (sockaddr *)&_address;
	}

	status_t status = _PrepareSendPath(address);
	if (status < B_OK)
		return status;

	TRACE("  Connect(): starting 3-way handshake...");

	fState = SYNCHRONIZE_SENT;
	T(State(this));

	// send SYN
	status = _SendAcknowledge();
	if (status != B_OK) {
		_Close();
		return status;
	}

	// If we are running over Loopback, after _SendAcknowledge() returns we
	// may be in ESTABLISHED already.
	if (fState == ESTABLISHED) {
		TRACE("  Connect() completed after _SendQueued()");
		return B_OK;
	}

	// wait until 3-way handshake is complete (if needed)
	bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
	if (timeout == 0) {
		// we're a non-blocking socket
		TRACE("  Connect() delayed, return EINPROGRESS");
		return EINPROGRESS;
	}

	// Remember the absolute deadline, so that a restarted syscall does not
	// start over with the full timeout (see top of the function).
	bigtime_t absoluteTimeout = absolute_timeout(timeout);
	gStackModule->store_syscall_restart_timeout(absoluteTimeout);

	status = _WaitForEstablished(locker, absoluteTimeout);
	TRACE("  Connect(): Connection complete: %s (timeout was %" B_PRIdBIGTIME
		")", strerror(status), timeout);
	return posix_error(status);
}
685 
686 
/*!	Waits for an incoming connection and returns its socket in
	\a _acceptedSocket.

	The wait is bounded by the socket's receive timeout; the absolute deadline
	is preserved across syscall restarts.

	\return \c B_OK on success, \c B_WOULD_BLOCK if the receive timeout is
		zero and no connection is pending, otherwise the error returned by
		acquire_sem_etc().
*/
status_t
TCPEndpoint::Accept(struct net_socket** _acceptedSocket)
{
	MutexLocker locker(fLock);

	TRACE("Accept()");
	T(APICall(this, "accept"));

	// NOTE(review): there is no check for fState == LISTEN here, although
	// fAcceptSemaphore is only created by Listen() — confirm that callers
	// guarantee this.

	status_t status;
	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
	if (gStackModule->is_restarted_syscall())
		timeout = gStackModule->restore_syscall_restart_timeout();
	else
		gStackModule->store_syscall_restart_timeout(timeout);

	do {
		// The endpoint lock must not be held while blocking on the
		// semaphore.
		locker.Unlock();

		status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
			| B_CAN_INTERRUPT, timeout);
		if (status != B_OK) {
			// the locker is unlocked at this point
			if (status == B_TIMED_OUT && socket->receive.timeout == 0)
				return B_WOULD_BLOCK;

			return status;
		}

		locker.Lock();
		// The semaphore was acquired, but dequeuing can still fail; in that
		// case wait for the next release.
		status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
#ifdef TRACE_TCP
		if (status == B_OK)
			TRACE("  Accept() returning %p", (*_acceptedSocket)->first_protocol);
#endif
	} while (status != B_OK);

	return status;
}
724 
725 
726 status_t
727 TCPEndpoint::Bind(const sockaddr *address)
728 {
729 	if (address == NULL)
730 		return B_BAD_VALUE;
731 
732 	MutexLocker lock(fLock);
733 
734 	TRACE("Bind() on address %s", PrintAddress(address));
735 	T(APICall(this, "bind"));
736 
737 	if (fState != CLOSED)
738 		return EISCONN;
739 
740 	return fManager->Bind(this, address);
741 }
742 
743 
744 status_t
745 TCPEndpoint::Unbind(struct sockaddr *address)
746 {
747 	MutexLocker _(fLock);
748 
749 	TRACE("Unbind()");
750 	T(APICall(this, "unbind"));
751 
752 	return fManager->Unbind(this);
753 }
754 
755 
756 status_t
757 TCPEndpoint::Listen(int count)
758 {
759 	MutexLocker _(fLock);
760 
761 	TRACE("Listen()");
762 	T(APICall(this, "listen"));
763 
764 	if (fState != CLOSED && fState != LISTEN)
765 		return B_BAD_VALUE;
766 
767 	if (fState == CLOSED) {
768 		fAcceptSemaphore = create_sem(0, "tcp accept");
769 		if (fAcceptSemaphore < B_OK)
770 			return ENOBUFS;
771 
772 		status_t status = fManager->SetPassive(this);
773 		if (status != B_OK) {
774 			delete_sem(fAcceptSemaphore);
775 			fAcceptSemaphore = -1;
776 			return status;
777 		}
778 	}
779 
780 	gSocketModule->set_max_backlog(socket, count);
781 
782 	fState = LISTEN;
783 	T(State(this));
784 	return B_OK;
785 }
786 
787 
788 status_t
789 TCPEndpoint::Shutdown(int direction)
790 {
791 	MutexLocker lock(fLock);
792 
793 	TRACE("Shutdown(%i)", direction);
794 	T(APICall(this, "shutdown"));
795 
796 	if (direction == SHUT_RD || direction == SHUT_RDWR)
797 		fFlags |= FLAG_NO_RECEIVE;
798 
799 	if (direction == SHUT_WR || direction == SHUT_RDWR) {
800 		// TODO: That's not correct. After read/write shutting down the socket
801 		// one should still be able to read previously arrived data.
802 		_Disconnect(false);
803 	}
804 
805 	return B_OK;
806 }
807 
808 
/*!	Puts data contained in \a buffer into send buffer

	If the send queue cannot take the whole buffer at once, the call blocks
	(bounded by the socket's send timeout, unless MSG_DONTWAIT is set) and
	queues the data in pieces as space becomes available.

	\return \c EOPNOTSUPP for flags other than MSG_DONTWAIT, MSG_OOB and
		MSG_EOF, \c ENOTCONN in the CLOSED state, \c EDESTADDRREQ in the
		LISTEN state, \c EPIPE if the endpoint is (or becomes) neither
		writable nor establishing, \c ENOBUFS if the buffer could not be
		cloned, the posix_error() translated wait error, or \c B_OK.
*/
status_t
TCPEndpoint::SendData(net_buffer *buffer)
{
	MutexLocker lock(fLock);

	TRACE("SendData(buffer %p, size %" B_PRIu32 ", flags %#" B_PRIx32
		") [total %" B_PRIuSIZE " bytes, has %" B_PRIuSIZE "]", buffer,
		buffer->size, buffer->flags, fSendQueue.Size(), fSendQueue.Free());
	T(APICall(this, "senddata"));

	const uint32 flags = buffer->msg_flags;
	if ((flags & ~(MSG_DONTWAIT | MSG_OOB | MSG_EOF)) != 0)
		return EOPNOTSUPP;

	if (fState == CLOSED)
		return ENOTCONN;
	if (fState == LISTEN)
		return EDESTADDRREQ;
	if (!is_writable(fState) && !is_establishing(fState))
		return EPIPE;

	// number of bytes of the original buffer that still have to be queued
	// NOTE(review): for buffer->size == 0 the loop below is skipped and the
	// buffer is never handed to fSendQueue — confirm who owns it then.
	size_t left = buffer->size;

	// A timeout of 0 (MSG_DONTWAIT) is passed on to _WaitForCondition()
	// unchanged; otherwise the absolute deadline survives syscall restarts.
	bigtime_t timeout = 0;
	if ((flags & MSG_DONTWAIT) == 0) {
		timeout = absolute_timeout(socket->send.timeout);
		if (gStackModule->is_restarted_syscall())
			timeout = gStackModule->restore_syscall_restart_timeout();
		else
			gStackModule->store_syscall_restart_timeout(timeout);
	}

	while (left > 0) {
		while (fSendQueue.Free() < socket->send.low_water_mark) {
			// initiate a send before waiting
			_SendQueued();

			// wait until enough space is available
			status_t status = _WaitForCondition(fSendCondition, lock, timeout);
			if (status < B_OK) {
				TRACE("  SendData() returning %s (%d)",
					strerror(posix_error(status)), (int)posix_error(status));
				return posix_error(status);
			}

			// the connection state may have changed while waiting
			if (!is_writable(fState) && !is_establishing(fState))
				return EPIPE;
		}

		size_t size = fSendQueue.Free();
		if (size < left) {
			// we need to split the original buffer
			net_buffer* clone = gBufferModule->clone(buffer, false);
				// TODO: add offset/size parameter to net_buffer::clone() or
				// even a move_data() function, as this is a bit inefficient
			if (clone == NULL)
				return ENOBUFS;

			// cut the clone down to the part that fits ...
			status_t status = gBufferModule->trim(clone, size);
			if (status != B_OK) {
				gBufferModule->free(clone);
				return status;
			}

			// ... and drop that part from the front of the original
			// (the result of remove_header() is not checked)
			gBufferModule->remove_header(buffer, size);
			left -= size;
			fSendQueue.Add(clone);
		} else {
			// the remainder fits; the queue gets the original buffer
			left -= buffer->size;
			fSendQueue.Add(buffer);
		}
	}

	TRACE("  SendData(): %" B_PRIuSIZE " bytes used.", fSendQueue.Used());

	bool force = false;
	if ((flags & MSG_OOB) != 0) {
		fSendUrgentOffset = fSendQueue.LastSequence();
			// RFC 961 specifies that the urgent offset points to the last
			// byte of urgent data. However, this is commonly implemented as
			// here, ie. it points to the first byte after the urgent data.
		force = true;
	}
	// MSG_EOF: close our sending side right after this data
	if ((flags & MSG_EOF) != 0)
		_Disconnect(false);

	// the result of the send attempt is not reported to the caller
	_SendQueued(force);

	return B_OK;
}
900 
901 
902 ssize_t
903 TCPEndpoint::SendAvailable()
904 {
905 	MutexLocker locker(fLock);
906 
907 	ssize_t available;
908 
909 	if (is_writable(fState))
910 		available = fSendQueue.Free();
911 	else if (is_establishing(fState))
912 		available = 0;
913 	else
914 		available = EPIPE;
915 
916 	TRACE("SendAvailable(): %" B_PRIdSSIZE, available);
917 	T(APICall(this, "sendavailable"));
918 	return available;
919 }
920 
921 
922 status_t
923 TCPEndpoint::FillStat(net_stat *stat)
924 {
925 	MutexLocker _(fLock);
926 
927 	strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
928 	stat->receive_queue_size = fReceiveQueue.Available();
929 	stat->send_queue_size = fSendQueue.Used();
930 
931 	return B_OK;
932 }
933 
934 
/*!	Retrieves up to \a numBytes from the receive queue into \a _buffer.

	Unless MSG_DONTWAIT is given, the call blocks (bounded by the socket's
	receive timeout) until the receive low water mark - or, with MSG_WAITALL,
	\a numBytes - is available, pushed data is complete, or the connection is
	closing. With MSG_PEEK the data stays in the queue.

	\return \c EOPNOTSUPP for unsupported flags, the socket's error or
		\c ENOTCONN in the CLOSED and LISTEN states, \c B_WOULD_BLOCK if
		nothing can be returned without blocking, \c B_OK with \a _buffer
		left \c NULL when the connection is closing and no data is left,
		a posix_error() translated wait error, or the result of the receive
		queue's Get().
*/
status_t
TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
{
	if ((flags & ~(MSG_DONTWAIT | MSG_WAITALL | MSG_PEEK)) != 0)
		return EOPNOTSUPP;

	MutexLocker locker(fLock);

	TRACE("ReadData(%" B_PRIuSIZE " bytes, flags %#" B_PRIx32 ")", numBytes,
		flags);
	T(APICall(this, "readdata"));

	*_buffer = NULL;

	if (fState == CLOSED || fState == LISTEN) {
		// a pending socket error takes precedence over ENOTCONN
		if (socket->error != B_OK)
			return socket->error;
		return ENOTCONN;
	}

	// timeout stays 0 for MSG_DONTWAIT, which makes the loop below return
	// B_WOULD_BLOCK instead of waiting
	bigtime_t timeout = 0;
	if ((flags & MSG_DONTWAIT) == 0) {
		timeout = absolute_timeout(socket->receive.timeout);
		if (gStackModule->is_restarted_syscall())
			timeout = gStackModule->restore_syscall_restart_timeout();
		else
			gStackModule->store_syscall_restart_timeout(timeout);
	}

	// While the handshake is still in progress, wait for it to complete
	// first.
	if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
		if (flags & MSG_DONTWAIT)
			return B_WOULD_BLOCK;

		status_t status = _WaitForEstablished(locker, timeout);
		if (status < B_OK)
			return posix_error(status);
	}

	size_t dataNeeded = socket->receive.low_water_mark;

	// When MSG_WAITALL is set then the function should block
	// until the full amount of data can be returned.
	if (flags & MSG_WAITALL)
		dataNeeded = numBytes;

	// TODO: add support for urgent data (MSG_OOB)

	// Leaving this loop via "break" means there is data to hand out; all
	// other exits return directly.
	while (true) {
		if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
			|| fState == TIME_WAIT) {
			// ``Connection closing''.
			if (fReceiveQueue.Available() > 0)
				break;
			return B_OK;
		}

		if (fReceiveQueue.Available() > 0) {
			// enough data, or everything available has been pushed
			if (fReceiveQueue.Available() >= dataNeeded
				|| (fReceiveQueue.PushedData() > 0
					&& fReceiveQueue.PushedData() >= fReceiveQueue.Available()))
				break;
		} else if (fState == FINISH_RECEIVED) {
			// ``If no text is awaiting delivery, the RECEIVE will
			//   get a Connection closing''.
			return B_OK;
		}

		if (timeout == 0)
			return B_WOULD_BLOCK;

		// the receiving side has been shut down; don't wait for more
		if ((fFlags & FLAG_NO_RECEIVE) != 0)
			return B_OK;

		status_t status = _WaitForCondition(fReceiveCondition, locker, timeout);
		if (status < B_OK) {
			// The Open Group base specification mentions that EINTR should be
			// returned if the recv() is interrupted before _any data_ is
			// available. So we actually check if there is data, and if so,
			// push it to the user.
			if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
				&& fReceiveQueue.Available() > 0)
				break;

			return posix_error(status);
		}
	}

	TRACE("  ReadData(): %" B_PRIuSIZE " are available.",
		fReceiveQueue.Available());

	// more data is queued than requested: notify the other waiters on the
	// receive condition
	if (numBytes < fReceiveQueue.Available())
		fReceiveCondition.NotifyAll();

	// MSG_PEEK: hand out a copy and leave the data in the queue
	bool clone = (flags & MSG_PEEK) != 0;

	ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);

	TRACE("  ReadData(): %" B_PRIuSIZE " bytes kept.",
		fReceiveQueue.Available());

	if (fReceiveQueue.Available() == 0 && fState == FINISH_RECEIVED)
		socket->receive.low_water_mark = 0;

	// if we opened the window, check if we should send a window update
	if (!clone) {
		// Only send if there's less than half the window size remaining.
		// TODO: This should use fReceiveWindow but that stays constant at present
		// due to another TODO.
		uint32 windowSizeRemaining = (fReceiveMaxAdvertised - fReceiveNext).Number();
		if (fReceiveNext == (fReceiveMaxAdvertised + 1))
			windowSizeRemaining = 0;
		if (windowSizeRemaining <= (fReceiveQueue.Size() / 2))
			_SendAcknowledge();
	}

	return receivedBytes;
}
1052 
1053 
1054 ssize_t
1055 TCPEndpoint::ReadAvailable()
1056 {
1057 	MutexLocker locker(fLock);
1058 
1059 	TRACE("ReadAvailable(): %" B_PRIdSSIZE, _AvailableData());
1060 	T(APICall(this, "readavailable"));
1061 
1062 	return _AvailableData();
1063 }
1064 
1065 
1066 status_t
1067 TCPEndpoint::SetSendBufferSize(size_t length)
1068 {
1069 	MutexLocker _(fLock);
1070 	fSendQueue.SetMaxBytes(length);
1071 	return B_OK;
1072 }
1073 
1074 
1075 status_t
1076 TCPEndpoint::SetReceiveBufferSize(size_t length)
1077 {
1078 	MutexLocker _(fLock);
1079 	fReceiveQueue.SetMaxBytes(length);
1080 	fFlags &= ~FLAG_AUTO_RECEIVE_BUFFER_SIZE;
1081 	return B_OK;
1082 }
1083 
1084 
1085 status_t
1086 TCPEndpoint::GetOption(int option, void* _value, int* _length)
1087 {
1088 	if (*_length != sizeof(int))
1089 		return B_BAD_VALUE;
1090 
1091 	int* value = (int*)_value;
1092 
1093 	switch (option) {
1094 		case TCP_NODELAY:
1095 			if ((fOptions & TCP_NODELAY) != 0)
1096 				*value = 1;
1097 			else
1098 				*value = 0;
1099 			return B_OK;
1100 
1101 		case TCP_MAXSEG:
1102 			*value = fReceiveMaxSegmentSize;
1103 			return B_OK;
1104 
1105 		default:
1106 			return B_BAD_VALUE;
1107 	}
1108 }
1109 
1110 
1111 status_t
1112 TCPEndpoint::SetOption(int option, const void* _value, int length)
1113 {
1114 	if (option != TCP_NODELAY)
1115 		return B_BAD_VALUE;
1116 
1117 	if (length != sizeof(int))
1118 		return B_BAD_VALUE;
1119 
1120 	const int* value = (const int*)_value;
1121 
1122 	MutexLocker _(fLock);
1123 	if (*value)
1124 		fOptions |= TCP_NODELAY;
1125 	else
1126 		fOptions &= ~TCP_NODELAY;
1127 
1128 	return B_OK;
1129 }
1130 
1131 
1132 //	#pragma mark - misc
1133 
1134 
1135 bool
1136 TCPEndpoint::IsBound() const
1137 {
1138 	return !LocalAddress().IsEmpty(true);
1139 }
1140 
1141 
1142 bool
1143 TCPEndpoint::IsLocal() const
1144 {
1145 	return (fFlags & FLAG_LOCAL) != 0;
1146 }
1147 
1148 
1149 status_t
1150 TCPEndpoint::DelayedAcknowledge()
1151 {
1152 	// ACKs "MUST" be generated within 500ms of the first unACKed packet, and
1153 	// "SHOULD" be for at least every second full-size segment. (RFC 5681 § 4.2)
1154 
1155 	bigtime_t delay = TCP_DELAYED_ACKNOWLEDGE_TIMEOUT;
1156 	if ((fReceiveNext - fLastAcknowledgeSent) >= (fReceiveMaxSegmentSize * 2)) {
1157 		// Trigger an immediate timeout rather than invoking Send directly,
1158 		// allowing multiple invocations to be coalesced.
1159 		delay = 0;
1160 	} else if (gStackModule->is_timer_active(&fDelayedAcknowledgeTimer)) {
1161 		return B_OK;
1162 	}
1163 
1164 	gStackModule->set_timer(&fDelayedAcknowledgeTimer, delay);
1165 	T(TimerSet(this, "delayed ack", TCP_DELAYED_ACKNOWLEDGE_TIMEOUT));
1166 	return B_OK;
1167 }
1168 
1169 
1170 void
1171 TCPEndpoint::_StartPersistTimer()
1172 {
1173 	gStackModule->set_timer(&fPersistTimer, TCP_PERSIST_TIMEOUT);
1174 	T(TimerSet(this, "persist", TCP_PERSIST_TIMEOUT));
1175 }
1176 
1177 
1178 void
1179 TCPEndpoint::_EnterTimeWait()
1180 {
1181 	TRACE("_EnterTimeWait()");
1182 
1183 	if (fState == TIME_WAIT)
1184 		_CancelConnectionTimers();
1185 
1186 	_UpdateTimeWait();
1187 }
1188 
1189 
1190 void
1191 TCPEndpoint::_UpdateTimeWait()
1192 {
1193 	gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
1194 	T(TimerSet(this, "time-wait", TCP_MAX_SEGMENT_LIFETIME << 1));
1195 }
1196 
1197 
1198 void
1199 TCPEndpoint::_CancelConnectionTimers()
1200 {
1201 	gStackModule->cancel_timer(&fRetransmitTimer);
1202 	T(TimerSet(this, "retransmit", -1));
1203 	gStackModule->cancel_timer(&fPersistTimer);
1204 	T(TimerSet(this, "persist", -1));
1205 	gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
1206 	T(TimerSet(this, "delayed ack", -1));
1207 }
1208 
1209 
1210 /*!	Sends the FIN flag to the peer when the connection is still open.
1211 	Moves the endpoint to the next state depending on where it was.
1212 */
1213 status_t
1214 TCPEndpoint::_Disconnect(bool closing)
1215 {
1216 	tcp_state previousState = fState;
1217 
1218 	if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
1219 		fState = FINISH_SENT;
1220 	else if (fState == FINISH_RECEIVED)
1221 		fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
1222 	else
1223 		return B_OK;
1224 
1225 	T(State(this));
1226 
1227 	status_t status = _SendQueued();
1228 	if (status != B_OK) {
1229 		fState = previousState;
1230 		T(State(this));
1231 		return status;
1232 	}
1233 
1234 	return B_OK;
1235 }
1236 
1237 
1238 void
1239 TCPEndpoint::_MarkEstablished()
1240 {
1241 	fState = ESTABLISHED;
1242 	T(State(this));
1243 
1244 	gSocketModule->set_connected(socket);
1245 	if (gSocketModule->has_parent(socket))
1246 		release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
1247 
1248 	fSendCondition.NotifyAll();
1249 	gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free());
1250 }
1251 
1252 
1253 status_t
1254 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
1255 {
1256 	// TODO: Checking for CLOSED seems correct, but breaks several neon tests.
1257 	// When investigating this, also have a look at _Close() and _HandleReset().
1258 	while (fState < ESTABLISHED/* && fState != CLOSED*/) {
1259 		if (socket->error != B_OK)
1260 			return socket->error;
1261 
1262 		status_t status = _WaitForCondition(fSendCondition, locker, timeout);
1263 		if (status < B_OK)
1264 			return status;
1265 	}
1266 
1267 	return B_OK;
1268 }
1269 
1270 
1271 //	#pragma mark - receive
1272 
1273 
1274 void
1275 TCPEndpoint::_Close()
1276 {
1277 	_CancelConnectionTimers();
1278 	fState = CLOSED;
1279 	T(State(this));
1280 
1281 	fFlags |= FLAG_DELETE_ON_CLOSE;
1282 
1283 	fSendCondition.NotifyAll();
1284 	_NotifyReader();
1285 
1286 	if (gSocketModule->has_parent(socket)) {
1287 		// We still have a parent - obviously, we haven't been accepted yet,
1288 		// so no one could ever close us.
1289 		_CancelConnectionTimers();
1290 		gSocketModule->set_aborted(socket);
1291 	}
1292 }
1293 
1294 
1295 void
1296 TCPEndpoint::_HandleReset(status_t error)
1297 {
1298 	socket->error = error;
1299 	_Close();
1300 
1301 	gSocketModule->notify(socket, B_SELECT_WRITE, error);
1302 	gSocketModule->notify(socket, B_SELECT_ERROR, error);
1303 }
1304 
1305 
1306 void
1307 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
1308 {
1309 	if (fDuplicateAcknowledgeCount == 0)
1310 		fPreviousFlightSize = (fSendMax - fSendUnacknowledged).Number();
1311 
1312 	if (++fDuplicateAcknowledgeCount < 3) {
1313 		if (fSendQueue.Available(fSendMax) != 0 && fSendWindow != 0) {
1314 			fSendNext = fSendMax;
1315 			fCongestionWindow += fDuplicateAcknowledgeCount * fSendMaxSegmentSize;
1316 			_SendQueued();
1317 			TRACE("_DuplicateAcknowledge(): packet sent under limited transmit on receipt of dup ack");
1318 			fCongestionWindow -= fDuplicateAcknowledgeCount * fSendMaxSegmentSize;
1319 		}
1320 	}
1321 
1322 	if (fDuplicateAcknowledgeCount == 3) {
1323 		if ((segment.acknowledge - 1) > fRecover || (fCongestionWindow > fSendMaxSegmentSize &&
1324 			(fSendUnacknowledged - fPreviousHighestAcknowledge) <= 4 * fSendMaxSegmentSize)) {
1325 			fFlags |= FLAG_RECOVERY;
1326 			fRecover = fSendMax.Number() - 1;
1327 			fSlowStartThreshold = max_c(fPreviousFlightSize / 2, 2 * fSendMaxSegmentSize);
1328 			fCongestionWindow = fSlowStartThreshold + 3 * fSendMaxSegmentSize;
1329 			fSendNext = segment.acknowledge;
1330 			_SendQueued();
1331 			TRACE("_DuplicateAcknowledge(): packet sent under fast restransmit on the receipt of 3rd dup ack");
1332 		}
1333 	} else if (fDuplicateAcknowledgeCount > 3) {
1334 		uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
1335 		if ((fDuplicateAcknowledgeCount - 3) * fSendMaxSegmentSize <= flightSize)
1336 			fCongestionWindow += fSendMaxSegmentSize;
1337 		if (fSendQueue.Available(fSendMax) != 0) {
1338 			fSendNext = fSendMax;
1339 			_SendQueued();
1340 		}
1341 	}
1342 }
1343 
1344 
1345 void
1346 TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment,
1347 	size_t segmentLength)
1348 {
1349 	if (fFlags & FLAG_OPTION_TIMESTAMP) {
1350 		tcp_sequence sequence(segment.sequence);
1351 
1352 		if (fLastAcknowledgeSent >= sequence
1353 			&& fLastAcknowledgeSent < (sequence + segmentLength))
1354 			fReceivedTimestamp = segment.timestamp_value;
1355 	}
1356 }
1357 
1358 
1359 void
1360 TCPEndpoint::_UpdateReceiveBuffer()
1361 {
1362 	if (fReceiveWindowShift == 0 || fSmoothedRoundTripTime < 0)
1363 		return;
1364 	if ((fFlags & FLAG_AUTO_RECEIVE_BUFFER_SIZE) == 0)
1365 		return;
1366 
1367 	const uint64 maxWindowSize = UINT16_MAX << fReceiveWindowShift;
1368 	if (fReceiveQueue.Size() == maxWindowSize) {
1369 		// The window is already as large as it could be.
1370 		return;
1371 	}
1372 
1373 	if (fReceiveSizingTimestamp == 0) {
1374 		fReceiveSizingTimestamp = tcp_now();
1375 		fReceiveSizingReference = fReceiveNext;
1376 		return;
1377 	}
1378 
1379 	const uint32 received = (fReceiveNext - fReceiveSizingReference).Number();
1380 	if (received < (fReceiveQueue.Size() / 2))
1381 		return;
1382 
1383 	const uint32 since = tcp_diff_timestamp(fReceiveSizingTimestamp);
1384 	if (since == 0 || since < ((uint32)fSmoothedRoundTripTime * 2))
1385 		return;
1386 
1387 	fReceiveSizingTimestamp = tcp_now();
1388 	fReceiveSizingReference = fReceiveNext;
1389 
1390 	// TODO: add low_resource hook to reduce window sizes.
1391 	if (low_resource_state(B_KERNEL_RESOURCE_MEMORY) != B_NO_LOW_RESOURCE)
1392 		return;
1393 
1394 	// Target a window large enough to fit one second of traffic.
1395 	const uint64 oneSecondReceive = (received * 1000LL) / since;
1396 	if (fReceiveQueue.Size() >= oneSecondReceive)
1397 		return;
1398 
1399 	// Don't increase the window size too rapidly.
1400 	uint32 newWindowSize = min_c(oneSecondReceive, UINT32_MAX);
1401 	if (newWindowSize > (fReceiveQueue.Size() * 2))
1402 		newWindowSize = fReceiveQueue.Size() * 2;
1403 
1404 	// Round to the window shift and max segment size.
1405 	uint32 newWindowShifted = newWindowSize >> fReceiveWindowShift;
1406 	const uint32 maxSegmentShifted = fReceiveMaxSegmentSize >> fReceiveWindowShift;
1407 	newWindowShifted = (newWindowShifted / maxSegmentShifted) * maxSegmentShifted;
1408 	newWindowSize = newWindowShifted << fReceiveWindowShift;
1409 
1410 	if (newWindowSize > maxWindowSize)
1411 		newWindowSize = maxWindowSize;
1412 	if (fReceiveQueue.Size() >= newWindowSize)
1413 		return;
1414 
1415 	fReceiveQueue.SetMaxBytes(newWindowSize);
1416 	TRACE("TCPEndpoint: updated receive buffer size to %" B_PRIu32 "\n", newWindowSize);
1417 }
1418 
1419 
1420 ssize_t
1421 TCPEndpoint::_AvailableData() const
1422 {
1423 	// TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
1424 	//       the application of FLAG_NO_RECEIVE in listen()ing
1425 	//       sockets.
1426 	if (fState == LISTEN)
1427 		return gSocketModule->count_connected(socket);
1428 	if (fState == SYNCHRONIZE_SENT)
1429 		return 0;
1430 
1431 	ssize_t availableData = fReceiveQueue.Available();
1432 
1433 	if (availableData == 0 && !_ShouldReceive())
1434 		return ENOTCONN;
1435 	if (availableData == 0 && (fState == FINISH_RECEIVED || fState == WAIT_FOR_FINISH_ACKNOWLEDGE))
1436 		return ESHUTDOWN;
1437 	return availableData;
1438 }
1439 
1440 
1441 void
1442 TCPEndpoint::_NotifyReader()
1443 {
1444 	fReceiveCondition.NotifyAll();
1445 	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
1446 }
1447 
1448 
1449 bool
1450 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer)
1451 {
1452 	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1453 		// Remember the position of the finish received flag
1454 		fFinishReceived = true;
1455 		fFinishReceivedAt = segment.sequence + buffer->size;
1456 	}
1457 
1458 	fReceiveQueue.Add(buffer, segment.sequence);
1459 	fReceiveNext = fReceiveQueue.NextSequence();
1460 
1461 	if (fFinishReceived) {
1462 		// Set or reset the finish flag on the current segment
1463 		if (fReceiveNext < fFinishReceivedAt)
1464 			segment.flags &= ~TCP_FLAG_FINISH;
1465 		else
1466 			segment.flags |= TCP_FLAG_FINISH;
1467 	}
1468 
1469 	TRACE("  _AddData(): adding data, receive next = %" B_PRIu32 ". Now have %"
1470 		B_PRIuSIZE " bytes.", fReceiveNext.Number(), fReceiveQueue.Available());
1471 
1472 	_UpdateReceiveBuffer();
1473 
1474 	if ((segment.flags & TCP_FLAG_PUSH) != 0)
1475 		fReceiveQueue.SetPushPointer();
1476 
1477 	return fReceiveQueue.Available() > 0;
1478 }
1479 
1480 
1481 void
1482 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment)
1483 {
1484 	fInitialReceiveSequence = segment.sequence;
1485 	fFinishReceived = false;
1486 	fReceiveSizingTimestamp = 0;
1487 
1488 	// count the received SYN
1489 	segment.sequence++;
1490 
1491 	fReceiveNext = segment.sequence;
1492 	fReceiveQueue.SetInitialSequence(segment.sequence);
1493 
1494 	if ((fOptions & TCP_NOOPT) == 0) {
1495 		if (segment.max_segment_size > 0) {
1496 			// The maximum size of a segment that a TCP endpoint really sends,
1497 			// the "effective send MSS", MUST be the smaller of the send MSS and
1498 			// the largest transmission size permitted by the IP layer:
1499 			fSendMaxSegmentSize = min_c(segment.max_segment_size,
1500 				_MaxSegmentSize(*PeerAddress()));
1501 		}
1502 
1503 		if (segment.options & TCP_HAS_WINDOW_SCALE) {
1504 			fFlags |= FLAG_OPTION_WINDOW_SCALE;
1505 			fSendWindowShift = segment.window_shift;
1506 		} else {
1507 			fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
1508 			fReceiveWindowShift = 0;
1509 		}
1510 
1511 		if (segment.options & TCP_HAS_TIMESTAMPS) {
1512 			fFlags |= FLAG_OPTION_TIMESTAMP;
1513 			fReceivedTimestamp = segment.timestamp_value;
1514 		} else
1515 			fFlags &= ~FLAG_OPTION_TIMESTAMP;
1516 
1517 		if ((segment.options & TCP_SACK_PERMITTED) == 0) {
1518 			fFlags &= ~FLAG_OPTION_SACK_PERMITTED;
1519 			fFlags &= ~FLAG_AUTO_RECEIVE_BUFFER_SIZE;
1520 		}
1521 	}
1522 
1523 	if (fSendMaxSegmentSize > 2190)
1524 		fCongestionWindow = 2 * fSendMaxSegmentSize;
1525 	else if (fSendMaxSegmentSize > 1095)
1526 		fCongestionWindow = 3 * fSendMaxSegmentSize;
1527 	else
1528 		fCongestionWindow = 4 * fSendMaxSegmentSize;
1529 
1530 	fSendMaxSegments = fCongestionWindow / fSendMaxSegmentSize;
1531 	fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
1532 }
1533 
1534 
1535 bool
1536 TCPEndpoint::_ShouldReceive() const
1537 {
1538 	if ((fFlags & FLAG_NO_RECEIVE) != 0)
1539 		return false;
1540 
1541 	return fState == ESTABLISHED || fState == FINISH_SENT
1542 		|| fState == FINISH_ACKNOWLEDGED || fState == FINISH_RECEIVED;
1543 }
1544 
1545 
1546 int32
1547 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment,
1548 	net_buffer* buffer)
1549 {
1550 	MutexLocker _(fLock);
1551 
1552 	TRACE("Spawn()");
1553 
1554 	// TODO: proper error handling!
1555 	if (ProtocolSocket::Open() != B_OK) {
1556 		T(Error(this, "opening failed", __LINE__));
1557 		return DROP;
1558 	}
1559 
1560 	fState = SYNCHRONIZE_RECEIVED;
1561 	T(Spawn(parent, this));
1562 
1563 	fManager = parent->fManager;
1564 
1565 	if (fManager->BindChild(this, buffer->destination) != B_OK) {
1566 		T(Error(this, "binding failed", __LINE__));
1567 		return DROP;
1568 	}
1569 	if (_PrepareSendPath(buffer->source) != B_OK) {
1570 		T(Error(this, "prepare send faild", __LINE__));
1571 		return DROP;
1572 	}
1573 
1574 	fOptions = parent->fOptions;
1575 	fAcceptSemaphore = parent->fAcceptSemaphore;
1576 
1577 	_PrepareReceivePath(segment);
1578 
1579 	// send SYN+ACK
1580 	if (_SendAcknowledge() != B_OK) {
1581 		T(Error(this, "sending failed", __LINE__));
1582 		return DROP;
1583 	}
1584 
1585 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1586 		// we handled this flag now, it must not be set for further processing
1587 
1588 	return _Receive(segment, buffer);
1589 }
1590 
1591 
1592 int32
1593 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer)
1594 {
1595 	TRACE("ListenReceive()");
1596 
1597 	// Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state,
1598 	// but the error behaviour differs
1599 	if (segment.flags & TCP_FLAG_RESET)
1600 		return DROP;
1601 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1602 		return DROP | RESET;
1603 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1604 		return DROP;
1605 
1606 	// TODO: drop broadcast/multicast
1607 
1608 	// spawn new endpoint for accept()
1609 	net_socket* newSocket;
1610 	if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) {
1611 		T(Error(this, "spawning failed", __LINE__));
1612 		return DROP;
1613 	}
1614 
1615 	return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this,
1616 		segment, buffer);
1617 }
1618 
1619 
1620 int32
1621 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment,
1622 	net_buffer *buffer)
1623 {
1624 	TRACE("_SynchronizeSentReceive()");
1625 
1626 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1627 		&& (fInitialSendSequence >= segment.acknowledge
1628 			|| fSendMax < segment.acknowledge))
1629 		return DROP | RESET;
1630 
1631 	if (segment.flags & TCP_FLAG_RESET) {
1632 		_HandleReset(ECONNREFUSED);
1633 		return DROP;
1634 	}
1635 
1636 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1637 		return DROP;
1638 
1639 	fSendUnacknowledged = segment.acknowledge;
1640 	_PrepareReceivePath(segment);
1641 
1642 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
1643 		_MarkEstablished();
1644 	} else {
1645 		// simultaneous open
1646 		fState = SYNCHRONIZE_RECEIVED;
1647 		T(State(this));
1648 	}
1649 
1650 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1651 		// we handled this flag now, it must not be set for further processing
1652 
1653 	return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE;
1654 }
1655 
1656 
1657 int32
1658 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer)
1659 {
1660 	// PAWS processing takes precedence over regular TCP acceptability check
1661 	if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0 && (segment.flags & TCP_FLAG_RESET) == 0) {
1662 		if ((segment.options & TCP_HAS_TIMESTAMPS) == 0)
1663 			return DROP;
1664 		if ((int32)(fReceivedTimestamp - segment.timestamp_value) > 0
1665 			&& (fReceivedTimestamp - segment.timestamp_value) <= INT32_MAX)
1666 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1667 	}
1668 
1669 	uint32 advertisedWindow = segment.AdvertisedWindow(fSendWindowShift);
1670 	size_t segmentLength = buffer->size;
1671 
1672 	// First, handle the most common case for uni-directional data transfer
1673 	// (known as header prediction - the segment must not change the window,
1674 	// and must be the expected sequence, and contain no control flags)
1675 
1676 	if (fState == ESTABLISHED
1677 		&& segment.AcknowledgeOnly()
1678 		&& fReceiveNext == segment.sequence
1679 		&& advertisedWindow > 0 && advertisedWindow == fSendWindow
1680 		&& fSendNext == fSendMax) {
1681 		_UpdateTimestamps(segment, segmentLength);
1682 
1683 		if (segmentLength == 0) {
1684 			// this is a pure acknowledge segment - we're on the sending end
1685 			if (fSendUnacknowledged < segment.acknowledge
1686 				&& fSendMax >= segment.acknowledge) {
1687 				_Acknowledged(segment);
1688 				return DROP;
1689 			}
1690 		} else if (segment.acknowledge == fSendUnacknowledged
1691 			&& fReceiveQueue.IsContiguous()
1692 			&& fReceiveQueue.Free() >= segmentLength
1693 			&& (fFlags & FLAG_NO_RECEIVE) == 0) {
1694 			if (_AddData(segment, buffer))
1695 				_NotifyReader();
1696 
1697 			return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0
1698 				? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
1699 		}
1700 	}
1701 
1702 	// The fast path was not applicable, so we continue with the standard
1703 	// processing of the incoming segment
1704 
1705 	ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN);
1706 
1707 	if (fState != CLOSED && fState != TIME_WAIT) {
1708 		// Check sequence number
1709 		if (!segment_in_sequence(segment, segmentLength, fReceiveNext,
1710 				fReceiveWindow)) {
1711 			TRACE("  Receive(): segment out of window, next: %" B_PRIu32
1712 				" wnd: %" B_PRIu32, fReceiveNext.Number(), fReceiveWindow);
1713 			if ((segment.flags & TCP_FLAG_RESET) != 0) {
1714 				// TODO: this doesn't look right - review!
1715 				return DROP;
1716 			}
1717 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1718 		}
1719 	}
1720 
1721 	if ((segment.flags & TCP_FLAG_RESET) != 0) {
1722 		// Is this a valid reset?
1723 		// We generally ignore resets in time wait state (see RFC 1337)
1724 		if (fLastAcknowledgeSent <= segment.sequence
1725 			&& tcp_sequence(segment.sequence) < (fLastAcknowledgeSent
1726 				+ fReceiveWindow)
1727 			&& fState != TIME_WAIT) {
1728 			status_t error;
1729 			if (fState == SYNCHRONIZE_RECEIVED)
1730 				error = ECONNREFUSED;
1731 			else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
1732 				error = ENOTCONN;
1733 			else
1734 				error = ECONNRESET;
1735 
1736 			_HandleReset(error);
1737 		}
1738 
1739 		return DROP;
1740 	}
1741 
1742 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1743 		|| (fState == SYNCHRONIZE_RECEIVED
1744 			&& (fInitialReceiveSequence > segment.sequence
1745 				|| ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1746 					&& (fSendUnacknowledged > segment.acknowledge
1747 						|| fSendMax < segment.acknowledge))))) {
1748 		// reset the connection - either the initial SYN was faulty, or we
1749 		// received a SYN within the data stream
1750 		return DROP | RESET;
1751 	}
1752 
1753 	// TODO: Check this! Why do we advertize a window outside of what we should
1754 	// buffer?
1755 	fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
1756 		// the window must not shrink
1757 
1758 	// trim buffer to be within the receive window
1759 	int32 drop = (int32)(fReceiveNext - segment.sequence).Number();
1760 	if (drop > 0) {
1761 		if ((uint32)drop > buffer->size
1762 			|| ((uint32)drop == buffer->size
1763 				&& (segment.flags & TCP_FLAG_FINISH) == 0)) {
1764 			// don't accidently remove a FIN we shouldn't remove
1765 			segment.flags &= ~TCP_FLAG_FINISH;
1766 			drop = buffer->size;
1767 		}
1768 
1769 		// remove duplicate data at the start
1770 		TRACE("* remove %" B_PRId32 " bytes from the start", drop);
1771 		gBufferModule->remove_header(buffer, drop);
1772 		segment.sequence += drop;
1773 	}
1774 
1775 	int32 action = KEEP;
1776 
1777 	// Immediately acknowledge out-of-order segments, as well as any
1778 	// in-order segments following out-of-order segments, to trigger
1779 	// fast-retransmit and fast-recovery at the sender.
1780 	if (drop != 0 || (drop == 0 && !fReceiveQueue.IsContiguous()))
1781 		action |= IMMEDIATE_ACKNOWLEDGE;
1782 
1783 	drop = (int32)(segment.sequence + buffer->size
1784 		- (fReceiveNext + fReceiveWindow)).Number();
1785 	if (drop > 0) {
1786 		// remove data exceeding our window
1787 		if ((uint32)drop >= buffer->size) {
1788 			// if we can accept data, or the segment is not what we'd expect,
1789 			// drop the segment (an immediate acknowledge is always triggered)
1790 			if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
1791 				return DROP | IMMEDIATE_ACKNOWLEDGE;
1792 
1793 			action |= IMMEDIATE_ACKNOWLEDGE;
1794 		}
1795 
1796 		if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1797 			// we need to remove the finish, too, as part of the data
1798 			drop--;
1799 		}
1800 
1801 		segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
1802 		TRACE("* remove %" B_PRId32 " bytes from the end", drop);
1803 		gBufferModule->remove_trailer(buffer, drop);
1804 	}
1805 
1806 #ifdef TRACE_TCP
1807 	if (advertisedWindow > fSendWindow) {
1808 		TRACE("  Receive(): Window update %" B_PRIu32 " -> %" B_PRIu32,
1809 			fSendWindow, advertisedWindow);
1810 	}
1811 #endif
1812 
1813 	if (fSendWindow < fSendMaxSegmentSize
1814 			&& (advertisedWindow >= fSendMaxSegmentSize
1815 				|| advertisedWindow > (TCP_DEFAULT_MAX_SEGMENT_SIZE * 3))) {
1816 		// Our current send window is less than a segment wide, and the new one
1817 		// is larger, so trigger a send in case there's anything to be sent.
1818 		action |= SEND_QUEUED;
1819 	}
1820 
1821 	fSendWindow = advertisedWindow;
1822 	if (advertisedWindow > fSendMaxWindow)
1823 		fSendMaxWindow = advertisedWindow;
1824 
1825 	// Then look at the acknowledgement for any updates
1826 
1827 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
1828 		// process acknowledged data
1829 		if (fState == SYNCHRONIZE_RECEIVED)
1830 			_MarkEstablished();
1831 
1832 		if (fSendMax < segment.acknowledge)
1833 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1834 
1835 		if (segment.acknowledge == fSendUnacknowledged) {
1836 			if (buffer->size == 0 && advertisedWindow == fSendWindow
1837 				&& (segment.flags & TCP_FLAG_FINISH) == 0 && fSendUnacknowledged != fSendMax) {
1838 				TRACE("Receive(): duplicate ack!");
1839 				_DuplicateAcknowledge(segment);
1840 			}
1841 		} else if (segment.acknowledge < fSendUnacknowledged) {
1842 			return DROP;
1843 		} else {
1844 			// this segment acknowledges in flight data
1845 
1846 			if (fDuplicateAcknowledgeCount >= 3) {
1847 				// deflate the window.
1848 				if (segment.acknowledge > fRecover) {
1849 					uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
1850 					fCongestionWindow = min_c(fSlowStartThreshold,
1851 						max_c(flightSize, fSendMaxSegmentSize) + fSendMaxSegmentSize);
1852 					fFlags &= ~FLAG_RECOVERY;
1853 				}
1854 			}
1855 
1856 			if (fSendMax == segment.acknowledge)
1857 				TRACE("Receive(): all inflight data ack'd!");
1858 
1859 			if (segment.acknowledge > fSendQueue.LastSequence()
1860 					&& fState > ESTABLISHED) {
1861 				TRACE("Receive(): FIN has been acknowledged!");
1862 
1863 				switch (fState) {
1864 					case FINISH_SENT:
1865 						fState = FINISH_ACKNOWLEDGED;
1866 						T(State(this));
1867 						break;
1868 					case CLOSING:
1869 						fState = TIME_WAIT;
1870 						T(State(this));
1871 						_EnterTimeWait();
1872 						return DROP;
1873 					case WAIT_FOR_FINISH_ACKNOWLEDGE:
1874 						_Close();
1875 						break;
1876 
1877 					default:
1878 						break;
1879 				}
1880 			}
1881 
1882 			if (fState != CLOSED) {
1883 				tcp_sequence last = fLastAcknowledgeSent;
1884 				_Acknowledged(segment);
1885 				// we just sent an acknowledge, remove from action
1886 				if (last < fLastAcknowledgeSent)
1887 					action &= ~IMMEDIATE_ACKNOWLEDGE;
1888 			}
1889 		}
1890 	}
1891 
1892 	if (segment.flags & TCP_FLAG_URGENT) {
1893 		if (fState == ESTABLISHED || fState == FINISH_SENT
1894 			|| fState == FINISH_ACKNOWLEDGED) {
1895 			// TODO: Handle urgent data:
1896 			//  - RCV.UP <- max(RCV.UP, SEG.UP)
1897 			//  - signal the user that urgent data is available (SIGURG)
1898 		}
1899 	}
1900 
1901 	bool notify = false;
1902 
1903 	// The buffer may be freed if its data is added to the queue, so cache
1904 	// the size as we still need it later.
1905 	const uint32 bufferSize = buffer->size;
1906 
1907 	if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0)
1908 		&& _ShouldReceive())
1909 		notify = _AddData(segment, buffer);
1910 	else {
1911 		if ((fFlags & FLAG_NO_RECEIVE) != 0)
1912 			fReceiveNext += buffer->size;
1913 
1914 		action = (action & ~KEEP) | DROP;
1915 	}
1916 
1917 	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1918 		segmentLength++;
1919 		if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
1920 			TRACE("Receive(): peer is finishing connection!");
1921 			fReceiveNext++;
1922 			notify = true;
1923 
1924 			// FIN implies PUSH
1925 			fReceiveQueue.SetPushPointer();
1926 
1927 			// we'll reply immediately to the FIN if we are not
1928 			// transitioning to TIME WAIT so we immediatly ACK it.
1929 			action |= IMMEDIATE_ACKNOWLEDGE;
1930 
1931 			// other side is closing connection; change states
1932 			switch (fState) {
1933 				case ESTABLISHED:
1934 				case SYNCHRONIZE_RECEIVED:
1935 					fState = FINISH_RECEIVED;
1936 					T(State(this));
1937 					break;
1938 				case FINISH_SENT:
1939 					// simultaneous close
1940 					fState = CLOSING;
1941 					T(State(this));
1942 					break;
1943 				case FINISH_ACKNOWLEDGED:
1944 					fState = TIME_WAIT;
1945 					T(State(this));
1946 					_EnterTimeWait();
1947 					break;
1948 				case TIME_WAIT:
1949 					_UpdateTimeWait();
1950 					break;
1951 
1952 				default:
1953 					break;
1954 			}
1955 		}
1956 	}
1957 
1958 	if (notify)
1959 		_NotifyReader();
1960 
1961 	if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
1962 		action |= ACKNOWLEDGE;
1963 
1964 	_UpdateTimestamps(segment, segmentLength);
1965 
1966 	TRACE("Receive() Action %" B_PRId32, action);
1967 
1968 	return action;
1969 }
1970 
1971 
1972 int32
1973 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer)
1974 {
1975 	MutexLocker locker(fLock);
1976 
1977 	TRACE("SegmentReceived(): buffer %p (%" B_PRIu32 " bytes) address %s "
1978 		"to %s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
1979 		", wnd %" B_PRIu32, buffer, buffer->size, PrintAddress(buffer->source),
1980 		PrintAddress(buffer->destination), segment.flags, segment.sequence,
1981 		segment.acknowledge,
1982 		(uint32)segment.advertised_window << fSendWindowShift);
1983 	T(Receive(this, segment,
1984 		(uint32)segment.advertised_window << fSendWindowShift, buffer));
1985 	int32 segmentAction = DROP;
1986 
1987 	switch (fState) {
1988 		case LISTEN:
1989 			segmentAction = _ListenReceive(segment, buffer);
1990 			break;
1991 
1992 		case SYNCHRONIZE_SENT:
1993 			segmentAction = _SynchronizeSentReceive(segment, buffer);
1994 			break;
1995 
1996 		case SYNCHRONIZE_RECEIVED:
1997 		case ESTABLISHED:
1998 		case FINISH_RECEIVED:
1999 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
2000 		case FINISH_SENT:
2001 		case FINISH_ACKNOWLEDGED:
2002 		case CLOSING:
2003 		case TIME_WAIT:
2004 		case CLOSED:
2005 			segmentAction = _Receive(segment, buffer);
2006 			break;
2007 	}
2008 
2009 	// process acknowledge action as asked for by the *Receive() method
2010 	if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
2011 		_SendAcknowledge(true);
2012 	else if (segmentAction & ACKNOWLEDGE)
2013 		DelayedAcknowledge();
2014 
2015 	if (segmentAction & SEND_QUEUED)
2016 		_SendQueued();
2017 
2018 	if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE))
2019 			== (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) {
2020 
2021 		locker.Unlock();
2022 		if (gSocketModule->release_socket(socket))
2023 			segmentAction |= DELETED_ENDPOINT;
2024 	}
2025 
2026 	return segmentAction;
2027 }
2028 
2029 
2030 //	#pragma mark - send
2031 
2032 
2033 tcp_segment_header
2034 TCPEndpoint::_PrepareSendSegment()
2035 {
2036 	// we don't set FLAG_FINISH here, instead we do it
2037 	// conditionally below depending if we are sending
2038 	// the last bytes of the send queue.
2039 
2040 	uint8 flags = 0;
2041 	switch (fState) {
2042 		case CLOSED:
2043 			flags = TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE;
2044 			break;
2045 
2046 		case SYNCHRONIZE_SENT:
2047 			flags = TCP_FLAG_SYNCHRONIZE;
2048 			break;
2049 
2050 		case SYNCHRONIZE_RECEIVED:
2051 			flags = TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE;
2052 			break;
2053 
2054 		case ESTABLISHED:
2055 		case FINISH_RECEIVED:
2056 		case FINISH_ACKNOWLEDGED:
2057 		case TIME_WAIT:
2058 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
2059 		case FINISH_SENT:
2060 		case CLOSING:
2061 			flags = TCP_FLAG_ACKNOWLEDGE;
2062 
2063 		default:
2064 			break;
2065 	}
2066 
2067 	tcp_segment_header segment(flags);
2068 
2069 	if ((fOptions & TCP_NOOPT) == 0) {
2070 		if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) {
2071 			segment.options |= TCP_HAS_TIMESTAMPS;
2072 			segment.timestamp_reply = fReceivedTimestamp;
2073 			segment.timestamp_value = tcp_now();
2074 		}
2075 
2076 		if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
2077 				&& fSendNext == fInitialSendSequence) {
2078 			// add connection establishment options
2079 			segment.max_segment_size = fReceiveMaxSegmentSize;
2080 			if (fFlags & FLAG_OPTION_WINDOW_SCALE) {
2081 				segment.options |= TCP_HAS_WINDOW_SCALE;
2082 				segment.window_shift = fReceiveWindowShift;
2083 			}
2084 			if ((fFlags & FLAG_OPTION_SACK_PERMITTED) != 0)
2085 				segment.options |= TCP_SACK_PERMITTED;
2086 		}
2087 
2088 		if (!fReceiveQueue.IsContiguous()
2089 				&& (fFlags & FLAG_OPTION_SACK_PERMITTED) != 0) {
2090 			segment.options |= TCP_HAS_SACK;
2091 			int maxSackCount = MAX_SACK_BLKS
2092 				- (((fFlags & FLAG_OPTION_TIMESTAMP) != 0) ? 1 : 0);
2093 			memset(segment.sacks, 0, sizeof(segment.sacks));
2094 			segment.sackCount = fReceiveQueue.PopulateSackInfo(fReceiveNext,
2095 				maxSackCount, segment.sacks);
2096 		}
2097 	}
2098 
2099 	size_t availableBytes = fReceiveQueue.Free();
2100 	// window size must remain same for duplicate acknowledgements
2101 	if (!fReceiveQueue.IsContiguous())
2102 		availableBytes = (fReceiveMaxAdvertised - fReceiveNext).Number();
2103 	segment.SetAdvertisedWindow(availableBytes, fReceiveWindowShift);
2104 
2105 	segment.acknowledge = fReceiveNext.Number();
2106 
2107 	// Process urgent data
2108 	if (fSendUrgentOffset > fSendNext) {
2109 		segment.flags |= TCP_FLAG_URGENT;
2110 		segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number();
2111 	} else {
2112 		fSendUrgentOffset = fSendUnacknowledged.Number();
2113 			// Keep urgent offset updated, so that it doesn't reach into our
2114 			// send window on overlap
2115 		segment.urgent_offset = 0;
2116 	}
2117 
2118 	return segment;
2119 }
2120 
2121 
2122 status_t
2123 TCPEndpoint::_PrepareAndSend(tcp_segment_header& segment, net_buffer* buffer,
2124 	bool isRetransmit)
2125 {
2126 	LocalAddress().CopyTo(buffer->source);
2127 	PeerAddress().CopyTo(buffer->destination);
2128 
2129 	uint32 size = buffer->size, segmentLength = size;
2130 	segment.sequence = fSendNext.Number();
2131 
2132 	TRACE("_PrepareAndSend(): buffer %p (%" B_PRIu32 " bytes) address %s to "
2133 		"%s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
2134 		", rwnd %" B_PRIu16 ", cwnd %" B_PRIu32 ", ssthresh %" B_PRIu32
2135 		", len %" B_PRIu32 ", first %" B_PRIu32 ", last %" B_PRIu32,
2136 		buffer, buffer->size, PrintAddress(buffer->source),
2137 		PrintAddress(buffer->destination), segment.flags, segment.sequence,
2138 		segment.acknowledge, segment.advertised_window,
2139 		fCongestionWindow, fSlowStartThreshold, segmentLength,
2140 		fSendQueue.FirstSequence().Number(),
2141 		fSendQueue.LastSequence().Number());
2142 	T(Send(this, segment, buffer, fSendQueue.FirstSequence(),
2143 		fSendQueue.LastSequence()));
2144 
2145 	PROBE(buffer, sendWindow);
2146 
2147 	status_t status = add_tcp_header(AddressModule(), segment, buffer);
2148 	if (status != B_OK) {
2149 		gBufferModule->free(buffer);
2150 		return status;
2151 	}
2152 
2153 	if (segment.flags & TCP_FLAG_SYNCHRONIZE) {
2154 		segment.options &= ~TCP_HAS_WINDOW_SCALE;
2155 		segment.max_segment_size = 0;
2156 		size++;
2157 	}
2158 
2159 	if (segment.flags & TCP_FLAG_FINISH)
2160 		size++;
2161 
2162 	status = next->module->send_routed_data(next, fRoute, buffer);
2163 	if (status < B_OK) {
2164 		gBufferModule->free(buffer);
2165 		return status;
2166 	}
2167 
2168 	fSendNext += size;
2169 	if (fSendMax < fSendNext)
2170 		fSendMax = fSendNext;
2171 
2172 	fReceiveMaxAdvertised = fReceiveNext + segment.AdvertisedWindow(fReceiveWindowShift);
2173 
2174 	if (segmentLength != 0 && fState == ESTABLISHED)
2175 		--fSendMaxSegments;
2176 
2177 	if (fSendTime == 0 && !isRetransmit
2178 			&& (segmentLength != 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)) {
2179 		fSendTime = tcp_now();
2180 		fRoundTripStartSequence = segment.sequence;
2181 	}
2182 
2183 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
2184 		fLastAcknowledgeSent = segment.acknowledge;
2185 		gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
2186 	}
2187 
2188 	return B_OK;
2189 }
2190 
2191 
2192 inline bool
2193 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length,
2194 	uint32 segmentMaxSize, uint32 flightSize)
2195 {
2196 	if (fState == ESTABLISHED && fSendMaxSegments == 0)
2197 		return false;
2198 
2199 	if (length > 0) {
2200 		// Avoid the silly window syndrome - we only send a segment in case:
2201 		// - we have a full segment to send, or
2202 		// - we're at the end of our buffer queue, or
2203 		// - the buffer is at least larger than half of the maximum send window,
2204 		//   or
2205 		// - we're retransmitting data
2206 		if (length == segmentMaxSize
2207 			|| (fOptions & TCP_NODELAY) != 0
2208 			|| tcp_sequence(fSendNext + length) == fSendQueue.LastSequence()
2209 			|| (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2))
2210 			return true;
2211 	}
2212 
2213 	// check if we need to send a window update to the peer
2214 	if (segment.advertised_window > 0) {
2215 		// correct the window to take into account what already has been advertised
2216 		uint32 window = segment.AdvertisedWindow(fReceiveWindowShift)
2217 			- (fReceiveMaxAdvertised - fReceiveNext).Number();
2218 		if (fReceiveNext == (fReceiveMaxAdvertised + 1))
2219 			window = 0;
2220 
2221 		// if we can advertise a window larger than twice the maximum segment
2222 		// size, or half the maximum buffer size we send a window update
2223 		if (window >= (fReceiveMaxSegmentSize << 1)
2224 			|| window >= (socket->receive.buffer_size >> 1))
2225 			return true;
2226 	}
2227 
2228 	if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH
2229 			| TCP_FLAG_RESET)) != 0)
2230 		return true;
2231 
2232 	// We do have urgent data pending
2233 	if (fSendUrgentOffset > fSendNext)
2234 		return true;
2235 
2236 	// there is no reason to send a segment just now
2237 	return false;
2238 }
2239 
2240 
2241 status_t
2242 TCPEndpoint::_SendAcknowledge(bool force)
2243 {
2244 	if (fRoute == NULL || fState == LISTEN)
2245 		return B_ERROR;
2246 
2247 	tcp_segment_header segment = _PrepareSendSegment();
2248 
2249 	// Is there actually anything to do?
2250 	if (!force && fState == ESTABLISHED
2251 			&& fLastAcknowledgeSent == fReceiveNext
2252 			&& fReceiveQueue.IsContiguous()
2253 			&& !_ShouldSendSegment(segment, 0, 0, 0))
2254 		return B_OK;
2255 
2256 	net_buffer* buffer = gBufferModule->create(256);
2257 	if (buffer == NULL)
2258 		return B_NO_MEMORY;
2259 
2260 	return _PrepareAndSend(segment, buffer, false);
2261 }
2262 
2263 
/*!	Sends one or more TCP segments with the data waiting in the queue.

	The amount sent is limited by the effective send window (the smaller of
	the peer's window and the congestion window) minus what has already been
	consumed between fSendUnacknowledged and fSendNext. A retransmission
	(fSendNext < fSendMax) sends a single segment per call.

	\param force If \c true, the segment is sent without consulting
		_ShouldSendSegment(), and no FIN/PUSH flags are added. The persist
		timer calls this method that way.
	\return \c B_ERROR without a route or before the connection is
		ESTABLISHED, \c B_NO_MEMORY if a buffer could not be created, the
		error of the send queue or of _PrepareAndSend() if those fail, and
		\c B_OK otherwise (including when there was nothing to send).
*/
status_t
TCPEndpoint::_SendQueued(bool force)
{
	if (fRoute == NULL || fState < ESTABLISHED)
		return B_ERROR;

	tcp_segment_header segment = _PrepareSendSegment();

	// The effective window is the peer's window, further limited by the
	// congestion window if that one is set.
	uint32 sendWindow = fSendWindow;
	if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
		sendWindow = fCongestionWindow;

	// fSendUnacknowledged
	//  |    fSendNext      fSendMax
	//  |        |              |
	//  v        v              v
	//  -----------------------------------
	//  | effective window           |
	//  -----------------------------------

	// Flight size represents the window of data which is currently in the
	// ether. We should never send data such that the flight size becomes
	// larger than the effective window. Note however that the effective
	// window may be reduced (by congestion for instance), so at some point in
	// time flight size may be larger than the currently calculated window.

	uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
	uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number();
		// the part of the window in front of fSendNext

	if (consumedWindow > sendWindow) {
		sendWindow = 0;
		// TODO: enter persist state? try to get a window update.
	} else
		sendWindow -= consumedWindow;

	// What can be sent is bounded by both the queue contents from fSendNext
	// on and the remaining window.
	uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
	if (length == 0 && !state_needs_finish(fState)) {
		// Nothing to send.
		return B_OK;
	}

	// The retransmit timer is started with the first segment sent while
	// nothing in front of fSendNext is outstanding.
	bool shouldStartRetransmitTimer = fSendNext == fSendUnacknowledged;
	// Anything below fSendMax has been sent before.
	bool retransmit = fSendNext < fSendMax;

	if (fDuplicateAcknowledgeCount != 0) {
		// send at most 1 SMSS of data when under limited transmit, fast
		// transmit/recovery
		length = min_c(length, fSendMaxSegmentSize);
	}

	do {
		// The options take space away from the payload of each segment.
		uint32 segmentMaxSize = fSendMaxSegmentSize
			- tcp_options_length(segment);
		uint32 segmentLength = min_c(length, segmentMaxSize);

		// If this segment reaches the end of the send queue, it gets the FIN
		// (if the state asks for one) and the PUSH flag (if any data is left
		// to send) - but never when forced.
		if ((fSendNext + segmentLength) == fSendQueue.LastSequence() && !force) {
			if (state_needs_finish(fState))
				segment.flags |= TCP_FLAG_FINISH;
			if (length > 0)
				segment.flags |= TCP_FLAG_PUSH;
		}

		// Determine if we should really send this segment
		if (!force && !retransmit && !_ShouldSendSegment(segment, segmentLength,
				segmentMaxSize, flightSize)) {
			// The segment is held back. If the queue still holds data and no
			// other timer would trigger sending it, arm the persist timer.
			if (fSendQueue.Available()
				&& !gStackModule->is_timer_active(&fPersistTimer)
				&& !gStackModule->is_timer_active(&fRetransmitTimer))
				_StartPersistTimer();
			break;
		}

		net_buffer *buffer = gBufferModule->create(256);
		if (buffer == NULL)
			return B_NO_MEMORY;

		// Fill the buffer from the send queue; a segment without payload
		// (e.g. a lone FIN) skips this step.
		status_t status = B_OK;
		if (segmentLength > 0)
			status = fSendQueue.Get(buffer, fSendNext, segmentLength);
		if (status < B_OK) {
			gBufferModule->free(buffer);
			return status;
		}

		// NOTE(review): sendWindow is not read again after this point in
		// this function.
		sendWindow -= buffer->size;

		// _PrepareAndSend() frees the buffer itself on failure.
		status = _PrepareAndSend(segment, buffer, retransmit);
		if (status != B_OK)
			return status;

		if (shouldStartRetransmitTimer) {
			TRACE("starting initial retransmit timer of: %" B_PRIdBIGTIME,
				fRetransmitTimeout);
			gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
			T(TimerSet(this, "retransmit", fRetransmitTimeout));
			shouldStartRetransmitTimer = false;
		}

		length -= segmentLength;
		// These flags must not be repeated by further segments sent in this
		// loop.
		segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET
			| TCP_FLAG_FINISH);

		// A retransmission is limited to a single segment per call.
		if (retransmit)
			break;

	} while (length > 0);

	return B_OK;
}
2373 
2374 
2375 int
2376 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const
2377 {
2378 	return next->module->get_mtu(next, address) - sizeof(tcp_header);
2379 }
2380 
2381 
2382 status_t
2383 TCPEndpoint::_PrepareSendPath(const sockaddr* peer)
2384 {
2385 	if (fRoute == NULL) {
2386 		fRoute = gDatalinkModule->get_route(Domain(), peer);
2387 		if (fRoute == NULL)
2388 			return ENETUNREACH;
2389 
2390 		if ((fRoute->flags & RTF_LOCAL) != 0)
2391 			fFlags |= FLAG_LOCAL;
2392 	}
2393 
2394 	// make sure connection does not already exist
2395 	status_t status = fManager->SetConnection(this, *LocalAddress(), peer,
2396 		fRoute->interface_address->local);
2397 	if (status < B_OK)
2398 		return status;
2399 
2400 	fInitialSendSequence = system_time() >> 4;
2401 	fSendNext = fInitialSendSequence;
2402 	fSendUnacknowledged = fInitialSendSequence;
2403 	fSendMax = fInitialSendSequence;
2404 	fSendUrgentOffset = fInitialSendSequence;
2405 	fRecover = fInitialSendSequence.Number();
2406 
2407 	// we are counting the SYN here
2408 	fSendQueue.SetInitialSequence(fSendNext + 1);
2409 
2410 	fReceiveMaxSegmentSize = _MaxSegmentSize(peer);
2411 
2412 	// Compute the window shift we advertise to our peer - if it doesn't support
2413 	// this option, this will be reset to 0 (when its SYN is received)
2414 	fReceiveWindowShift = 0;
2415 	while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT
2416 			&& (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) {
2417 		fReceiveWindowShift++;
2418 	}
2419 
2420 	// Increase to a default of 8 (window minimum 256 bytes, maximum 15 MB.)
2421 	if (fReceiveWindowShift < 8 && !IsLocal())
2422 		fReceiveWindowShift = 8;
2423 
2424 	return B_OK;
2425 }
2426 
2427 
/*!	Processes the acknowledgement carried by \a segment.

	If it acknowledges new data, the data is removed from the send queue, the
	congestion window and round trip time are updated, the retransmit timer
	is canceled or restarted, and waiting writers are notified. In any case,
	data remaining in the send queue is tried to be sent afterwards.

	\param segment The received header; \c acknowledge must not be lower than
		fSendUnacknowledged (asserted).
*/
void
TCPEndpoint::_Acknowledged(tcp_segment_header& segment)
{
	TRACE("_Acknowledged(): ack %" B_PRIu32 "; uack %" B_PRIu32 "; next %"
		B_PRIu32 "; max %" B_PRIu32, segment.acknowledge,
		fSendUnacknowledged.Number(), fSendNext.Number(), fSendMax.Number());

	ASSERT(fSendUnacknowledged <= segment.acknowledge);

	if (fSendUnacknowledged < segment.acknowledge) {
		// New data has been acknowledged; it does not need to be kept for
		// retransmission any longer.
		fSendQueue.RemoveUntil(segment.acknowledge);

		uint32 bytesAcknowledged = segment.acknowledge - fSendUnacknowledged.Number();
		fPreviousHighestAcknowledge = fSendUnacknowledged;
		fSendUnacknowledged = segment.acknowledge;
		uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
			// what is still outstanding after this acknowledgement
		int32 expectedSamples = flightSize / (fSendMaxSegmentSize << 1);
			// one sample per two full segments in flight; used to weight
			// the round trip time update below

		if (fPreviousHighestAcknowledge > fSendUnacknowledged) {
			// need to update the recover variable upon a sequence wraparound
			fRecover = segment.acknowledge - 1;
		}

		// the acknowledgment of the SYN/ACK MUST NOT increase the size of the
		// congestion window
		// NOTE(review): fSendUnacknowledged has already been advanced to
		// segment.acknowledge at this point, so this test presumably always
		// succeeds - confirm whether the previous value was meant.
		if (fSendUnacknowledged != fInitialSendSequence) {
			if (fCongestionWindow < fSlowStartThreshold) {
				// Below the slow start threshold: grow by the acknowledged
				// bytes, but by one maximum sized segment at most.
				fCongestionWindow += min_c(bytesAcknowledged, fSendMaxSegmentSize);
			} else {
				// At or above the threshold: grow by MSS * MSS / window, but
				// by one byte at least.
				uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize;

				if (increment < fCongestionWindow)
					increment = 1;
				else
					increment /= fCongestionWindow;

				fCongestionWindow += increment;
			}

			// Lift the limit on the number of segments that may be sent.
			fSendMaxSegments = UINT32_MAX;
		}

		if ((fFlags & FLAG_RECOVERY) != 0) {
			// While in recovery: send again from the first unacknowledged
			// byte, shrink the congestion window by the acknowledged amount
			// (giving one segment back if more than a segment was
			// acknowledged), then continue at fSendMax.
			fSendNext = fSendUnacknowledged;
			_SendQueued();
			fCongestionWindow -= bytesAcknowledged;

			if (bytesAcknowledged > fSendMaxSegmentSize)
				fCongestionWindow += fSendMaxSegmentSize;

			fSendNext = fSendMax;
		} else
			fDuplicateAcknowledgeCount = 0;

		// Never try to send data that has been acknowledged already.
		if (fSendNext < fSendUnacknowledged)
			fSendNext = fSendUnacknowledged;

		// Update the round trip time: either from the echoed timestamp, or,
		// without the timestamp option, from the segment that is currently
		// being timed once it has been acknowledged.
		if (fFlags & FLAG_OPTION_TIMESTAMP) {
			_UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply),
				expectedSamples > 0 ? expectedSamples : 1);
		} else if (fSendTime != 0 && fRoundTripStartSequence < segment.acknowledge) {
			_UpdateRoundTripTime(tcp_diff_timestamp(fSendTime), 1);
			fSendTime = 0;
				// allows the next measurement to be started
		}

		if (fSendUnacknowledged == fSendMax) {
			TRACE("all acknowledged, cancelling retransmission timer.");
			gStackModule->cancel_timer(&fRetransmitTimer);
			T(TimerSet(this, "retransmit", -1));
		} else {
			TRACE("data acknowledged, resetting retransmission timer to: %"
				B_PRIdBIGTIME, fRetransmitTimeout);
			gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
			T(TimerSet(this, "retransmit", fRetransmitTimeout));
		}

		if (is_writable(fState)) {
			// notify threads waiting on the socket to become writable again
			fSendCondition.NotifyAll();
			gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free());
		}
	}

	// if there is data left to be sent, send it now
	if (fSendQueue.Used() > 0)
		_SendQueued();
}
2514 
2515 
2516 void
2517 TCPEndpoint::_Retransmit()
2518 {
2519 	TRACE("Retransmit()");
2520 
2521 	if (fState < ESTABLISHED) {
2522 		fRetransmitTimeout = TCP_SYN_RETRANSMIT_TIMEOUT;
2523 		fCongestionWindow = fSendMaxSegmentSize;
2524 	} else {
2525 		_ResetSlowStart();
2526 		fDuplicateAcknowledgeCount = 0;
2527 		// Do exponential back off of the retransmit timeout
2528 		fRetransmitTimeout *= 2;
2529 		if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT)
2530 			fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT;
2531 	}
2532 
2533 	fSendNext = fSendUnacknowledged;
2534 	_SendQueued();
2535 
2536 	fRecover = fSendNext.Number() - 1;
2537 	if ((fFlags & FLAG_RECOVERY) != 0)
2538 		fFlags &= ~FLAG_RECOVERY;
2539 }
2540 
2541 
2542 void
2543 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime, int32 expectedSamples)
2544 {
2545 	if (roundTripTime < 0)
2546 		return;
2547 
2548 	if (fSmoothedRoundTripTime <= 0) {
2549 		fSmoothedRoundTripTime = roundTripTime;
2550 		fRoundTripVariation = roundTripTime / 2;
2551 		fRetransmitTimeout = (fSmoothedRoundTripTime + max_c(100, fRoundTripVariation * 4))
2552 				* kTimestampFactor;
2553 	} else {
2554 		int32 delta = fSmoothedRoundTripTime - roundTripTime;
2555 		if (delta < 0)
2556 			delta = -delta;
2557 		fRoundTripVariation += (delta - fRoundTripVariation) / (expectedSamples * 4);
2558 		fSmoothedRoundTripTime += (roundTripTime - fSmoothedRoundTripTime) / (expectedSamples * 8);
2559 		fRetransmitTimeout = (fSmoothedRoundTripTime + max_c(100, fRoundTripVariation * 4))
2560 			* kTimestampFactor;
2561 	}
2562 
2563 	if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT)
2564 		fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT;
2565 
2566 	if (fRetransmitTimeout < TCP_MIN_RETRANSMIT_TIMEOUT)
2567 		fRetransmitTimeout = TCP_MIN_RETRANSMIT_TIMEOUT;
2568 
2569 	TRACE("  RTO is now %" B_PRIdBIGTIME " (after rtt %" B_PRId32 "ms)",
2570 		fRetransmitTimeout, roundTripTime);
2571 }
2572 
2573 
2574 void
2575 TCPEndpoint::_ResetSlowStart()
2576 {
2577 	fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2,
2578 		2 * fSendMaxSegmentSize);
2579 	fCongestionWindow = fSendMaxSegmentSize;
2580 }
2581 
2582 
2583 //	#pragma mark - timer
2584 
2585 
2586 /*static*/ void
2587 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint)
2588 {
2589 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2590 	T(TimerTriggered(endpoint, "retransmit"));
2591 
2592 	MutexLocker locker(endpoint->fLock);
2593 	if (!locker.IsLocked() || gStackModule->is_timer_active(timer))
2594 		return;
2595 
2596 	endpoint->_Retransmit();
2597 }
2598 
2599 
2600 /*static*/ void
2601 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint)
2602 {
2603 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2604 	T(TimerTriggered(endpoint, "persist"));
2605 
2606 	MutexLocker locker(endpoint->fLock);
2607 	if (!locker.IsLocked())
2608 		return;
2609 
2610 	// the timer might not have been canceled early enough
2611 	if (endpoint->State() == CLOSED)
2612 		return;
2613 
2614 	endpoint->_SendQueued(true);
2615 }
2616 
2617 
2618 /*static*/ void
2619 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint)
2620 {
2621 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2622 	T(TimerTriggered(endpoint, "delayed ack"));
2623 
2624 	MutexLocker locker(endpoint->fLock);
2625 	if (!locker.IsLocked())
2626 		return;
2627 
2628 	// the timer might not have been canceled early enough
2629 	if (endpoint->State() == CLOSED)
2630 		return;
2631 
2632 	endpoint->_SendAcknowledge();
2633 }
2634 
2635 
2636 /*static*/ void
2637 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint)
2638 {
2639 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2640 	T(TimerTriggered(endpoint, "time-wait"));
2641 
2642 	MutexLocker locker(endpoint->fLock);
2643 	if (!locker.IsLocked())
2644 		return;
2645 
2646 	if ((endpoint->fFlags & FLAG_CLOSED) == 0) {
2647 		endpoint->fFlags |= FLAG_DELETE_ON_CLOSE;
2648 		return;
2649 	}
2650 
2651 	locker.Unlock();
2652 
2653 	gSocketModule->release_socket(endpoint->socket);
2654 }
2655 
2656 
2657 /*static*/ status_t
2658 TCPEndpoint::_WaitForCondition(ConditionVariable& condition,
2659 	MutexLocker& locker, bigtime_t timeout)
2660 {
2661 	ConditionVariableEntry entry;
2662 	condition.Add(&entry);
2663 
2664 	locker.Unlock();
2665 	status_t result = entry.Wait(B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, timeout);
2666 	locker.Lock();
2667 
2668 	return result;
2669 }
2670 
2671 
2672 //	#pragma mark -
2673 
2674 
2675 void
2676 TCPEndpoint::Dump() const
2677 {
2678 	kprintf("TCP endpoint %p\n", this);
2679 	kprintf("  socket: %p\n", socket);
2680 	kprintf("  state: %s\n", name_for_state(fState));
2681 	kprintf("  flags: 0x%" B_PRIx32 "\n", fFlags);
2682 #if KDEBUG
2683 	kprintf("  lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder);
2684 #endif
2685 	kprintf("  accept sem: %" B_PRId32 "\n", fAcceptSemaphore);
2686 	kprintf("  options: 0x%" B_PRIx32 "\n", (uint32)fOptions);
2687 	kprintf("  send\n");
2688 	kprintf("    window shift: %" B_PRIu8 "\n", fSendWindowShift);
2689 	kprintf("    unacknowledged: %" B_PRIu32 "\n",
2690 		fSendUnacknowledged.Number());
2691 	kprintf("    next: %" B_PRIu32 "\n", fSendNext.Number());
2692 	kprintf("    max: %" B_PRIu32 "\n", fSendMax.Number());
2693 	kprintf("    urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number());
2694 	kprintf("    window: %" B_PRIu32 "\n", fSendWindow);
2695 	kprintf("    max window: %" B_PRIu32 "\n", fSendMaxWindow);
2696 	kprintf("    max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize);
2697 	kprintf("    queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", fSendQueue.Used(),
2698 		fSendQueue.Size());
2699 #if DEBUG_TCP_BUFFER_QUEUE
2700 	fSendQueue.Dump();
2701 #endif
2702 	kprintf("    last acknowledge sent: %" B_PRIu32 "\n",
2703 		fLastAcknowledgeSent.Number());
2704 	kprintf("    initial sequence: %" B_PRIu32 "\n",
2705 		fInitialSendSequence.Number());
2706 	kprintf("  receive\n");
2707 	kprintf("    window shift: %" B_PRIu8 "\n", fReceiveWindowShift);
2708 	kprintf("    next: %" B_PRIu32 "\n", fReceiveNext.Number());
2709 	kprintf("    max advertised: %" B_PRIu32 "\n",
2710 		fReceiveMaxAdvertised.Number());
2711 	kprintf("    window: %" B_PRIu32 "\n", fReceiveWindow);
2712 	kprintf("    max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize);
2713 	kprintf("    queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n",
2714 		fReceiveQueue.Available(), fReceiveQueue.Size());
2715 #if DEBUG_TCP_BUFFER_QUEUE
2716 	fReceiveQueue.Dump();
2717 #endif
2718 	kprintf("    initial sequence: %" B_PRIu32 "\n",
2719 		fInitialReceiveSequence.Number());
2720 	kprintf("    duplicate acknowledge count: %" B_PRIu32 "\n",
2721 		fDuplicateAcknowledgeCount);
2722 	kprintf("  smoothed round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n",
2723 		fSmoothedRoundTripTime, fRoundTripVariation);
2724 	kprintf("  retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout);
2725 	kprintf("  congestion window: %" B_PRIu32 "\n", fCongestionWindow);
2726 	kprintf("  slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold);
2727 }
2728 
2729